### Configuration
################################################################################
### R Options
nodeInfo = system(paste0("scontrol show node ", Sys.info()["nodename"]), intern = TRUE)
cpus = as.integer(unlist(strsplit(unlist(strsplit(nodeInfo[grep("CfgTRES", nodeInfo)], "cpu="))[2], ","))[1])
memInKB = as.integer(unlist(strsplit(unlist(strsplit(nodeInfo[grep("CfgTRES", nodeInfo)], "mem="))[2], "M,"))[1]) * 1024^2
options(stringsAsFactors=FALSE,
dplyr.summarise.inform=FALSE,
future.globals.maxSize=min(memInKB, 20 * 1024^3),
mc.cores=min(cpus,1),
future.fork.enable=TRUE, future.plan="multicore",
future.rng.onMisuse="ignore")
### Fuctions
# Git directory and files to source must be done first, then all helper functions can be sourced
git_files_to_source = c("functions_analysis.R",
"functions_biomart.R",
"functions_degs.R",
"functions_io.R",
"functions_plotting.R",
"functions_util.R")
git_files_to_source = file.path(param$path_to_git, "R", git_files_to_source)
file_exists = purrr::map_lgl(git_files_to_source, file.exists)
if (any(!file_exists)) stop(paste("The following files could not be found:", paste(git_files_to_source[!file_exists], collapse=", "), ". Please check the git directory at '", param$path_to_git, "'.!"))
invisible(purrr::map(git_files_to_source, source))
### Debugging mode
# 'default_debugging' for default, 'terminal_debugger' for debugging without X11, 'print_traceback' for non-interactive sessions
param$debugging_mode = "default_debugging"
switch (param$debugging_mode,
default_debugging=on_error_default_debugging(),
terminal_debugger=on_error_start_terminal_debugger(),
print_traceback=on_error_just_print_traceback(),
on_error_default_debugging())### Rmarkdown configuration
################################################################################
### Create output directories
if (!file.exists(file.path(param$path_out, "figures"))) dir.create(file.path(param$path_out, "figures"), recursive=TRUE, showWarnings=FALSE)
if (!file.exists(file.path(param$path_out, "data"))) dir.create(file.path(param$path_out, "data"), recursive=TRUE, showWarnings=FALSE)
### R Options
options(citation_format="pandoc",
knitr.table.format="html",
kableExtra_view_html=TRUE)
### Knitr default options
knitr::opts_chunk$set(echo=TRUE, # output code
cache=FALSE, # do not cache results
message=TRUE, # show messages
warning=TRUE, # show warnings
results = "hold", # show results after running whole chunk
tidy=FALSE, # do not auto-tidy-up code
fig.width=10, # default fig width in inches
class.source='fold-hide', # by default collapse code blocks
dev=c('png', 'svg', 'tiff'), # create figures in png, tiff, and svg; the first device (png) will be used for HTML output
dev.args=list(png = list(type="cairo"), # png: use cairo - works on cluster
tiff = list(type="cairo", compression = 'zip')), # tiff: use cairo - works on cluster, transparent background, compression type "zip" is ‘deflate (Adobe-style)’
dpi=300, # figure resolution
fig.retina=2, # retina multiplier
fig.path=paste0(file.path(param$path_out, "figures"), "/") # Path for figures in png and pdf format (trailing "/" is needed)
)
### Set output hooks
knitr::knit_hooks$set(message=format_message, warning=format_warning)
### Required libraries
library(magrittr)
### Set the bibliographic environment
# Clear previous bibliography
knitcitations::cleanbib()
# If the DOI publication servers cannot be reached, there will be no citations, knitcitations will not write a references.bib file and pandoc will stop. This makes sure that there is always at least one citation.
bib_ref = knitcitations::read.bibtex(file.path(param$path_to_git,"assets","sc_analysis_references.bib"))
invisible(knitcitations::citep(bib_ref))
### Figure heights
# high figures
fig_large_height = 8
# Single figure, landscape
fig_standard_height = 4
# Two plots alongside (e.g. umaps)
fig_standard2_height = 5
# Three plots alongside (e.g. umaps)
fig_standard3_height = 4
# Four plots alongside (e.g. umaps)
fig_standard4_height = 2.5
# Four plots 2x2 (e.g. umaps)
fig_patchwork4_height = fig_standard2_height * 2
# Six plots 2x3 (e.g. umaps)
fig_patchwork6_height = fig_standard3_height * 2
# Eight plots 4x2 (e.g. umaps)
fig_patchwork8_height = fig_standard4_height * 2
# Twelve plots 4x3 (e.g. umaps)
fig_patchwork12_height = fig_standard4_height * 3
# Sixteen plots 4x4 (e.g. umaps)
fig_patchwork16_height = fig_standard4_height * 4# Load renv and virtualenvs
renv::load(file.path(param$path_to_git,"env/basic"))
renv::use_python(type = "virtualenv", name = file.path(param$path_to_git,"env/basic/virtualenvs/r-reticulate"))
#reticulate::use_virtualenv(file.path(param$path_to_git,"env/basic/virtualenvs/r-reticulate"))
# Required libraries
library(Seurat) # main
library(ggplot2) # plots
library(patchwork) # combination of plots
library(magrittr) # %>% operator
library(knitr)
library(enrichR)Gene annotation including Ensembl IDs, gene symbols, Entrez Ids, and Seurat gene names, are loaded from a pre-prepared reference file or Ensembl.
## Read gene annotation
# We read gene annotation from file.
# We generate several dictionaries to translate between Ensembl IDs, gene symbols, Entrez Ids, and Seurat gene names.
# ATTENTION: ONLY for human and mouse!!!
### Set reference
################################################################################
if (param$species=="human") {
param$mart_dataset = ifelse(is.null(param$mart_dataset), "hsapiens_gene_ensembl", param$mart_dataset)
param$mt = ifelse(is.null(param$mt), "^MT-", param$mt)
param$enrichr_dbs = if(is.null(param$enrichr_dbs)) {
c("GO_Biological_Process_2023", "WikiPathway_2023_Human", "Azimuth_2023", "CellMarker_2024")
}
param$annotation_dbs = ifelse(is.null(param$annotation_dbs), "BlueprintEncodeData()", param$annotation_dbs)
} else {
if (param$species=="mouse") {
param$mart_dataset = ifelse(is.null(param$mart_dataset), "mmusculus_gene_ensembl", param$mart_dataset)
param$mt = ifelse(is.null(param$mt), "^mt-", param$mt)
param$enrichr_dbs = if(is.null(param$enrichr_dbs)) {
c("GO_Biological_Process_2023", "WikiPathways_2019_Mouse", "Azimuth_2023", "CellMarker_2024")
}
param$annotation_dbs = ifelse(is.null(param$annotation_dbs), "MouseRNAseqData()", param$annotation_dbs)
} else {
param$mart_dataset=param$mart_dataset
}
}
# Set defaults
# Default is Ensembl release 98 which corresponds to 2020-A reference package of 10x Genomics Cell Ranger
param$annot_version = ifelse(is.null(param$annot_version), 98, param$annot_version)
param$annot_main = if(is.null(param$annot_main)) {
c(ensembl="ensembl_gene_id", symbol="external_gene_name", entrez="entrezgene_accession")
}
param$mart_attributes = if(is.null(param$mart_attributes)) {
c(c(ensembl="ensembl_gene_id", symbol="external_gene_name", entrez="entrezgene_accession"),
c("chromosome_name", "start_position", "end_position", "percentage_gene_gc_content", "gene_biotype", "strand", "description"))
}
param$path_reference = file.path(param$path_to_git, "references", param$mart_dataset, param$annot_version)
param$reference = paste0(param$mart_dataset, ".v", param$annot_version, ".annot.txt")
param$file_annot = ifelse(is.null(param$file_annot), file.path(param$path_reference, param$reference), param$file_annot)
param$file_cc_genes = ifelse(is.null(param$file_cc_genes), file.path(param$path_reference, "cell_cycle_markers.xlsx"), param$file_cc_genes)
### Download reference if not existing
################################################################################
if (!file.exists(param$file_annot) | !file.exists(param$file_cc_genes)) {
param$file_annot = file.path(param$path_reference, param$reference)
param$file_cc_genes = file.path(param$path_reference, "cell_cycle_markers.xlsx")
source(file.path(param$path_to_git, "modules/download_references/download_references.R"))
}
### read_ensembl_annotation
################################################################################
# Load from file
annot_ensembl = read.delim(param$file_annot)
# Double-check if we got all required annotation, in case annotation file was read
check_annot_main = all(param$annot_main %in% colnames(annot_ensembl))
if (!check_annot_main) {
stop("The annotation table misses at least one of the following columns: ", paste(param$annot_main, collapse=", "))
}
# Translate IDs
IDs_out = suppressWarnings(TranslateIDs(annot_ensembl, param$annot_main))
ensembl_to_seurat_rowname = IDs_out[[1]]
seurat_rowname_to_ensembl = IDs_out[[2]]
seurat_rowname_to_entrez = IDs_out[[3]]
annot_ensembl = IDs_out[[4]]
### read_cc_genes
################################################################################
# Use biomart to translate human cell cycle genes to the species of interest
# Load from file
genes_s = openxlsx::read.xlsx(param$file_cc_genes, sheet=1)
genes_g2m = openxlsx::read.xlsx(param$file_cc_genes, sheet=2)
# Convert Ensembl ID to Seurat-compatible unique rowname
genes_s = data.frame(Human_gene_name=genes_s$Human_gene_name, Species_gene_name=unname(ensembl_to_seurat_rowname[genes_s$Species_ensembl_id]))
genes_g2m = data.frame(Human_gene_name=genes_g2m$Human_gene_name, Species_gene_name=unname(ensembl_to_seurat_rowname[genes_g2m$Species_ensembl_id]))The workflow is appicable to single cell and nuclei RNAseq data pre-processed via 10x Genomics or SmartSeq-2 or for other data that are represented by a simple table with transcript counts per gene and cell. In the first step, a Seurat object of the data is generated and subsequently some metadata are added. Similarly, a Seurat object can be loaded to inspect the stored scRNA seq data.
Here, for the project Testdata, the following data are analysed:
### Read rds object
################################################################################
### Load Seurat S4 objects
# Test if file is defined
if (is.null(param$data)) {
message("Dataset is not specified")
} else {
# Test if file exists
if (file.exists(param$data)) {
# Read object
message(paste0("Load dataset:", param$data))
sc = base::readRDS(param$data)
# Transfer original params to loaded object
if ("parameters" %in% list_names(sc@misc[])) {
# Retrieve previous parameter settings
orig_param = sc@misc$parameters
if ("SCT" %in% names(sc@assays)) {
if ("scale.data" %in% Layers(sc[["SCT"]])) {
orig_param$norm = "SCT"
} else {
orig_param$norm = "RNA"
}
} else {
orig_param$norm = "RNA"
}
# Keep some parameter settings from object and project defined
orig_param_keep = orig_param[c("annot_version", "species")]
basic_param_keep = param[c("path_to_git", "scriptname", "author", "project_id", "data", "path_out", "file_annot", "file_cc_genes")]
# Test for reference concordance
rerun_read_gene_annotation = (orig_param$species == param$species & orig_param$annot_version == param$annot_version)
# Integrate parameter
param = modifyList(x = param, val = orig_param)
param = modifyList(x = param, val = basic_param_keep)
param = modifyList(x = param, val = param_advset)
param = modifyList(x = param, val = orig_param_keep)
}
# Rerun read_gene_annotation.R if not fitting to loaded object
if (!rerun_read_gene_annotation == TRUE) {
source(file.path(param$path_to_git,'scripts/read_data/read_gene_annotation.R'))
}
### Set colors
# Set sample colors based on orig.ident
if (!is.null(sc@meta.data[["orig.ident"]])) {
if (length(unique(sc@meta.data[["orig.ident"]]))!=length(param$col_samples)) {
message(paste0("No or wrong number of distinct colors for samples provieded. Generate colors using color palette", param$col_palette_samples))
sample_colours_out = suppressWarnings(SetSampleColours(sc, param$col_palette_samples))
sc = sample_colours_out[[1]]
param$col_samples = sample_colours_out[[2]]
}
}
# Set colors for clusters based on seurat_clusters
if (!is.null(sc@meta.data[["seurat_clusters"]])) {
if (length(unique(sc@meta.data[["seurat_clusters"]]))!=length(param$col_clusters)) {
message(paste0("No or wrong number of distinct colors for clusters provieded. Generate colors using color palette", param$col_palette_clusters))
cluster_colours_out = suppressWarnings(SetClusterColours(sc, param$col_palette_clusters))
sc = cluster_colours_out[[1]]
param$col_clusters = cluster_colours_out[[2]]
}
}
# Set colors for cell types based on annotation
if (!is.null(sc@meta.data[["annotation"]])) {
if (length(unique(sc@meta.data[["annotation"]]))!=length(param$col_annotation)) {
message(paste0("No or wrong number of distinct colors for cell tpye annotation provieded. Generate colors using color palette", param$col_palette_annotation))
annotation_colours_out = suppressWarnings(SetAnnotationColours(sc, param$col_palette_annotation))
sc = annotation_colours_out[[1]]
param$col_annotation = annotation_colours_out[[2]]
}
}
sc
} else {
message("Dataset does not exist")
}
}×
(Message)
Load
dataset:/mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/output/Testdata/pre-processing/data/sc.rds
×
(Message)
No or wrong number of distinct colors
for clusters provieded. Generate colors using color
paletteggsci::pal_igv
### Load reference Seurat S4 objects if specified
# Test if file is defined
if (!is.null(param$refdata)) {
# Test if file exists
if (file.exists(param$refdata)) {
# Read object
message(paste0("Load dataset:", param$refdata))
scR = base::readRDS(param$refdata)
# Transfer original params to loaded object
if ("parameters" %in% list_names(scR@misc[])) {
orig_paramR = scR@misc$parameters
if (!is.null(orig_paramR$col_samples)) {
param$col_samples_ref = orig_paramR$col_samples
}
if (!is.null(orig_paramR$col_clusters)) {
param$col_clusters_ref = orig_paramR$col_clusters
}
if (!is.null(orig_paramR$col_annotation)) {
param$col_annotation_ref = orig_paramR$col_annotation
}
param = modifyList(x = param, val = param_advset)
}
### Set colors
# Set sample colors based on orig.ident
if (!is.null(scR@meta.data[["orig.ident"]])) {
if (length(unique(scR@meta.data[["orig.ident"]]))!=length(param$col_samples_ref)) {
message(paste0("No or wrong number of distinct colors for samples provieded. Generate colors using color palette", param$col_palette_samples))
sample_colours_out = suppressWarnings(SetSampleColours(scR, param$col_palette_samples))
scR = sample_colours_out[[1]]
param$col_samples_ref = sample_colours_out[[2]]
}
}
# Set colors for clusters based on seurat_clusters
if (!is.null(scR@meta.data[["seurat_clusters"]])) {
if (length(unique(scR@meta.data[["seurat_clusters"]]))!=length(param$col_clusters_ref)) {
message(paste0("No or wrong number of distinct colors for clusters provieded. Generate colors using color palette", param$col_palette_clusters))
cluster_colours_out = suppressWarnings(SetClusterColours(scR, param$col_palette_clusters))
scR = cluster_colours_out[[1]]
param$col_clusters_ref = cluster_colours_out[[2]]
}
}
# Set colors for cell types based on annotation
if (!is.null(scR@meta.data[["annotation"]])) {
if (length(unique(scR@meta.data[["annotation"]]))!=length(param$col_annotation_ref)) {
message(paste0("No or wrong number of distinct colors for cell tpye annotation provieded. Generate colors using color palette", param$col_palette_annotation))
annotation_colours_out = suppressWarnings(SetAnnotationColours(scR, param$col_palette_annotation))
scR = annotation_colours_out[[1]]
param$col_annotation_ref = annotation_colours_out[[2]]
}
}
scR
} else {
message("Reference dataset does not exist")
}
} ## An object of class Seurat
## 6439 features across 1078 samples within 1 assay
## Active assay: RNA (6439 features, 3000 variable features)
## 3 layers present: data, counts, scale.data
## 2 dimensional reductions calculated: pca, umap
# Confirm correct setting of normalization method
# Normalization method can not be overwritten with advanced_settings as long as the object was not normalized using the respective method
if (param$norm %in% names(sc@assays)) {
if (param$norm == "RNA") {
if ("scale.data" %in% Layers(sc[["RNA"]])) {
param$norm = "RNA"
} else {
param$norm = "SCT"
}
} else if (param$norm == "SCT") {
if ("scale.data" %in% Layers(sc[["SCT"]])) {
param$norm = "SCT"
} else {
param$norm = "RNA"
}
} else {
param$norm = orig_param$norm
}
} else {
param$norm = "RNA"
}
message(paste0("Chosen normalisation method and normlized values detecte in provided object were harmonised and set to: ", ifelse(param$norm=="RNA", "Standard log normalisation", "SCTransform")))×
(Message)
Chosen normalisation method and
normlized values detecte in provided object were harmonised and set to:
Standard log normalisation
The clustering method used by Seurat first constructs a graph structure, where nodes are cells and edges are drawn between cells with similar gene expression patterns. Technically speaking, Seurat first constructs a K-nearest neighbor (KNN) graph based on Euclidean distance in PCA space, and refines edge weights between cells based on the shared overlap in their local neighborhoods (Jaccard similarity). To partition the graph into highly interconnected parts, cells are iteratively grouped together using the Leiden algorithm (Traag et al. (2019)).
At this point, we would like to define subpopulations of cells with similar gene expression profiles using unsupervised clustering. Clusters ultimately serve as approximations for biological objects like cell types or cell states.
During the first step of clustering, a K-nearest neighbor (KNN) graph is constructed. In simplified terms this means that cells are connected to their K nearest neighbors based on cell-to-cell expression similarity using the PCs chosen in the previous step. The higher the similarity is, the higher the edge weight becomes. During the second step, the graph is partitioned into highly interconnected communities, whereby each community represents a cluster of cells with similar expression profiles. The separation of the graph into clusters is dependent on the chosen resolution. For scRNA-seq datasets of around 3000 cells, it is recommended to use a resolution value between 0.4 and 1.2. This value can be set even higher for larger datasets. Note that the choice of PCs and cluster resolution is an arbitrary one. Therefore, it is highly recommended to evaluate clusters and re-run the workflow with adapted parameters if needed.# Set default clustering
n = paste0(DefaultAssay(sc), "_snn_res.", param$cluster_resolution)
sc$seurat_clusters = sc[[n, drop=TRUE]]
Seurat::Idents(sc) = sc$seurat_clusters
if (length(levels(sc$seurat_clusters)) > 1) {
suppressWarnings({Seurat::Misc(sc, "trees") = c(Seurat::Misc(sc, "trees"), list(seurat_clusters = Seurat::Misc(sc, "trees")[[n]]))})
}
# Set up colors for default clustering
sc = ScAddLists(sc, lists=list(seurat_clusters=Misc(sc, "colour_lists")[[n]]), lists_slot="colour_lists")
param$col_clusters = Misc(sc, "colour_lists")[["seurat_clusters"]]We use a UMAP to visualise and explore the dataset. The goal is to place similar cells together in 2D space, and learn about the biology underlying the data. Cells are color-coded according to the graph-based clustering, and clusters typcially co-localise on the UMAP.
Take care not to mis-read a UMAP:
For a nice read to intuitively understand UMAP, see Coenen and Pearce (2024).
For this report, you chose the resolution value 0.6 as the final value
for further analyses.
# Note that you can set `label = TRUE` or use the LabelClusters function to help label individual clusters
cluster_cells = table(sc@active.ident)
cluster_labels = paste0(levels(sc@active.ident)," (", cluster_cells[levels(sc@active.ident)],")")
p_umap = Seurat::DimPlot(sc, reduction="umap", group.by="seurat_clusters") +
scale_color_manual(values=param$col_clusters, labels=cluster_labels) +
AddStyle(title="Cells coloured by cluster identity", legend_position="", legend_title="", xlab = "UMAP 1", ylab = "UMAP 2")
p_umap = LabelClusters(p_umap, id="seurat_clusters", box=TRUE, segment.color="black", fill="white", repel = TRUE)
sample_cells = table(sc$orig.ident)
sample_labels = paste0(levels(sc$orig.ident)," (", sample_cells[levels(sc$orig.ident)],")")
# Note: This is a hack to colour by sample but label by Cluster
p2 = Seurat::DimPlot(sc, reduction="umap", group.by="orig.ident") +
scale_color_manual(values=param$col_samples, labels=sample_labels) +
AddStyle(title="Cells coloured by sample of origin", legend_position="bottom", xlab = "UMAP 1", ylab = "UMAP 2")
p2$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p2$data), ]
p2 = LabelClusters(p2, id="seurat_clusters", box=TRUE, segment.color="black", fill="white", repel = TRUE)
p = p_umap + p2
p# Plot all samples separately
# Repel will be deactivated if there are more than six samples; otherwise the plot may crash
p = Seurat::DimPlot(sc, reduction="umap", group.by="seurat_clusters", pt.size=param$pt_size, split.by = "orig.ident", ncol = 2) +
scale_color_manual(values=param$col_clusters, labels=cluster_labels) +
AddStyle(title="Cells coloured by cluster identity", legend_position="bottom", legend_title="Clusters", xlab = "UMAP 1", ylab = "UMAP 2")
p = LabelClusters(p, id="seurat_clusters", box=TRUE, segment.color="black", fill="white", repel = TRUE)
p# Plot all samples separately
# Repel will be deactivated if there are more than six samples; otherwise the plot may crash
p = Seurat::DimPlot(sc, reduction="umap", group.by="orig.ident", pt.size=param$pt_size, split.by = "orig.ident", ncol = 2) +
scale_color_manual(values=param$col_samples, labels=sample_labels) +
AddStyle(title="Cells coloured by sample of origin", legend_position="bottom", xlab = "UMAP 1", ylab = "UMAP 2")
p$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p$data), ]
p = LabelClusters(p, id="seurat_clusters", box=TRUE, segment.color="black", fill="white", repel = TRUE)
p
Exploration of quality control metrics: determine whether clusters are unbalanced in their number of counts, detected genes or mitochondrial content.
p_list1 = list()
qc_feature = c(paste0("nCount_", param$assay_raw), paste0("nFeature_", param$assay_raw), "percent_mt", "percent_ribo")
title = c(paste0("Summed raw counts (nCount_", param$assay_raw, ", log10 scale)"),
paste0("Number of features with raw count > 0 (nFeature_", param$assay_raw, ", log10 scale)"),
"Percent of mitochondrial features (percent_mt)",
"Percent of ribosomal features (percent_ribo)")
if ("percent_ercc" %in% colnames(sc[[]])) {
qc_feature = c(qc_feature, "percent_ercc")
title = c(title, "Percent of ERCC features (percent_ercc)")
}
for (n in seq(qc_feature)) {
p1 = suppressMessages(Seurat::FeaturePlot(sc, features=qc_feature[n]) +
AddStyle(title="Feature plot") +
scale_colour_gradient(low="lightgrey", high=param$col))
if (qc_feature[n]==paste0("nCount_", param$assay_raw) | qc_feature[n]==paste0("nFeature_", param$assay_raw)) {
p1 = p1 + scale_colour_gradient(low="lightgrey", high=param$col, trans="log10")
}
p1$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p1$data), ]
p1 = LabelClusters(p1, id="seurat_clusters", box=TRUE, segment.color="black", fill="white", repel = TRUE)
p2 = ggplot(sc[[]], aes(x=.data[["seurat_clusters"]], y=.data[[qc_feature[n]]], fill=.data[["seurat_clusters"]], group=.data[["seurat_clusters"]])) +
geom_violin(scale="width") +
AddStyle(title="Violin plot (log10 scale)", fill=param$col_clusters,
xlab="Cluster", legend_position="none")
if (qc_feature[n]==paste0("nCount_", param$assay_raw) | qc_feature[n]==paste0("nFeature_", param$assay_raw)) {
p2 = p2 + scale_y_log10()
}
m = c(qc_feature[n], paste0("nFeature_", param$assay_raw))
p3 = ggplot(sc[[c(m, "seurat_clusters")]], aes(x=.data[[m[1]]], y=.data[[m[2]]], colour=.data[["seurat_clusters"]])) +
geom_point() +
AddStyle(col=param$col_clusters) +
facet_wrap(~seurat_clusters) +
theme(legend.position = "none")
p_list1[[n]] = (p1 | p2) / p3
p_list1[[n]] = p_list1[[n]] + patchwork::plot_annotation(title=title[n])
}print(p_list1[[1]])print(p_list1[[2]])print(p_list1[[3]])print(p_list1[[4]])if ("percent_ercc" %in% colnames(sc[[]])) {
print(p_list1[[5]])
} else {
message("No ERCC controls found.")
}×
(Message)
No ERCC controls found.
How much do gene expression profiles in the dataset reflect the cell cycle phases the single cells were in? If a cell cycle score for each cell based on its expression of G2M and S phase markers was calculated, it will be displayed here.
# Set up colours for cell cycle effect and add to sc object
col = GenerateColours(num_colours=length(levels(sc$Phase)), names=levels(sc$Phase), palette="ggsci::pal_npg", alphas=1)
sc = ScAddLists(sc, lists=list(Phase=col), lists_slot="colour_lists")
# Get a feeling for how many cells are affected
p1 = ggplot(sc[[]], aes(x=S.Score, y=G2M.Score, colour=Phase)) +
geom_point() +
scale_x_continuous("G1/S score") +
scale_y_continuous("G2/M score") +
AddStyle(col=Misc(sc, "colour_lists")[["Phase"]]) +
NoLegend()
p2 = ggplot(sc@meta.data %>%
dplyr::group_by(seurat_clusters, Phase) %>%
dplyr::summarise(num_cells=length(Phase)),
aes(x=seurat_clusters, y=num_cells, fill=Phase)) +
geom_bar(stat="identity", position="fill") +
scale_x_discrete("Seurat clusters") +
scale_y_continuous("Fraction of cells") +
AddStyle(fill=Misc(sc, "colour_lists")[["Phase"]]) +
NoLegend() +
theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1))
p3 = ggplot(sc[[]] %>%
dplyr::group_by(orig.ident, Phase) %>%
dplyr::summarise(num_cells=length(Phase)),
aes(x=orig.ident, y=num_cells, fill=Phase)) +
geom_bar(stat="identity", position="fill") +
scale_y_continuous("Fraction of cells") +
AddStyle(fill=Misc(sc, "colour_lists")[["Phase"]]) +
NoLegend() +
theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1)) + xlab("")
p23 = p2 + p3 + patchwork::plot_layout(guides = "collect") & theme(legend.position = 'bottom')
p = p1 + p23
pif (any(!is.na(sc$Phase))) {
p1 = Seurat::DimPlot(sc, group.by="Phase", pt.size=1, cols=Misc(sc, "colour_lists")[["Phase"]]) +
AddStyle(title="Cell cycle phases", legend_title="Phase", xlab = "UMAP 1", ylab = "UMAP 2")
p1$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p1$data), ]
p1 = LabelClusters(p1, id="seurat_clusters", box=TRUE, segment.color="black", fill="white", repel = TRUE)
p_list = list()
cc_features = c("S.Score", "G2M.Score", "CC.Difference")
title = c("S phase", "G2/M phase", "G2/M & S difference")
for (n in seq(cc_features)) {
p_list[[n]] = Seurat::FeaturePlot(sc, features=cc_features[n], pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col)) +
AddStyle(title=title[n], xlab = "", ylab = "")
}
p = p1 + patchwork::wrap_plots(p_list, ncol=1) +
plot_layout(widths = c(3, 1)) +
patchwork::plot_annotation(title="UMAP coloured by")
p
}
# Count cells per cluster per sample
cell_samples = sc[[]] %>% dplyr::pull(orig.ident) %>% levels()
cell_clusters = sc[[]] %>% dplyr::pull(seurat_clusters) %>% levels()
# Make table
tbl = dplyr::count(sc[[c("orig.ident", "seurat_clusters")]], orig.ident, seurat_clusters) %>% tidyr::pivot_wider(names_from="seurat_clusters", names_prefix="Cl_", values_from=n, values_fill=0) %>% as.data.frame()
rownames(tbl) = paste0(tbl[,"orig.ident"],"_n")
tbl[,"orig.ident"] = NULL
# Add percentages
tbl_perc = round(t(tbl) / colSums(tbl) * 100, 2) %>% t()
rownames(tbl_perc) = gsub(rownames(tbl_perc), pattern="_n$", replacement="_perc", perl=TRUE)
tbl = rbind(tbl, tbl_perc)
# Add enrichment
if (length(cell_samples) > 1 & length(cell_clusters) > 1) tbl = rbind(tbl, CellsFisher(sc))
# Sort
tbl = tbl[order(rownames(tbl)),, drop=FALSE]
# Plot percentages
tbl_bar = tbl[paste0(cell_samples, "_perc"), , drop=FALSE] %>%
tibble::rownames_to_column(var="Sample") %>%
tidyr::pivot_longer(tidyr::starts_with("Cl"), names_to="Cluster", values_to="Percentage")
tbl_bar$Cluster = tbl_bar$Cluster %>% gsub(pattern="^Cl_", replacement="", perl=TRUE) %>% factor(levels=sc$seurat_clusters %>% levels())
tbl_bar$Sample = tbl_bar$Sample %>% gsub(pattern="_perc$", replacement="", perl=TRUE) %>% as.factor()
tbl_bar$Percentage = as.numeric(tbl_bar$Percentage)
p1 = ggplot(tbl_bar, aes(x=Cluster, y=Percentage, fill=Sample)) +
geom_bar(stat="identity" ) +
AddStyle(title="Sample identity per clusters",
fill=param$col_samples,
legend_title="Sample",
legend_position="bottom")
p = p1 | p_umap
pThe following table shows the number of cells per sample and cluster:
In case the dataset contains 2 or more samples, we also calculate whether or not the number of cells of a sample in a cluster is significantly higher or lower than expected:
# Print table
knitr::kable(tbl, align="l", caption="Number of cells per sample and cluster") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", fixed_thead=TRUE)| Cl_1 | Cl_2 | Cl_3 | Cl_4 | Cl_5 | Cl_6 | Cl_7 | |
|---|---|---|---|---|---|---|---|
| Sample1_n | 88 | 88 | 75 | 72 | 54 | 50 | 18 |
| Sample1_perc | 41.51 | 44.22 | 42.37 | 42.35 | 37.76 | 41.67 | 31.58 |
| Sample1.oddsRatio | 1.01 | 1.16 | 1.06 | 1.05 | 0.84 | 1.02 | 0.64 |
| Sample1.p | 5.0e-01 | 2.0e-01 | 4.0e-01 | 4.1e-01 | 8.4e-01 | 5.0e-01 | 9.5e-01 |
| Sample2_n | 124 | 111 | 102 | 98 | 89 | 70 | 39 |
| Sample2_perc | 58.49 | 55.78 | 57.63 | 57.65 | 62.24 | 58.33 | 68.42 |
| Sample2.oddsRatio | 0.99 | 0.86 | 0.95 | 0.95 | 1.18 | 0.98 | 1.56 |
| Sample2.p | 5.6e-01 | 8.4e-01 | 6.6e-01 | 6.5e-01 | 2.0e-01 | 5.8e-01 | 8.1e-02 |
# Make table
tbl = dplyr::count(sc[[c("orig.ident", "seurat_clusters")]], orig.ident, seurat_clusters) %>% tidyr::pivot_wider(names_from="seurat_clusters", values_from=n, values_fill=0) %>% as.data.frame()
rownames(tbl) = paste0(tbl[,"orig.ident"])
tbl[,"orig.ident"] = NULL
# Plot counts
tbl_bar = tbl[cell_samples, , drop=FALSE] %>%
tibble::rownames_to_column(var="Sample") %>%
tidyr::pivot_longer(cols = tidyselect::all_of(cell_clusters), names_to="Cluster", values_to="Counts")
tbl_bar$Counts = as.numeric(tbl_bar$Counts)
tbl_bar$Cluster <- factor(tbl_bar$Cluster,levels = cell_clusters)
tbl_bar$Sample <- factor(tbl_bar$Sample, levels = cell_samples)
# Plot counts per condition
p1 = ggplot(tbl_bar, aes(x=Sample, y=Counts, fill=Cluster)) +
geom_bar(stat="identity" ) +
AddStyle(title="", xlab = "",
fill=param$col_clusters,
legend_position="") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Calculate percentages
tbl_perc = round(t(tbl) / colSums(t(tbl)) * 100, 2) %>% t() %>% as.data.frame()
tbl_perc = round(tbl / rowSums(tbl) * 100, 2) %>% as.data.frame()
# Plot percentages
tbl_bar_perc = tbl_perc[cell_samples, , drop=FALSE] %>%
tibble::rownames_to_column(var="Sample") %>%
tidyr::pivot_longer(cols = tidyselect::all_of(cell_clusters), names_to="Clusters", values_to="Percentage")
tbl_bar_perc$Percentage = as.numeric(tbl_bar_perc$Percentage)
tbl_bar_perc$Clusters <- factor(tbl_bar_perc$Clusters,levels = cell_clusters)
tbl_bar_perc$Sample <- factor(tbl_bar_perc$Sample, levels = cell_samples)
p2 = ggplot(tbl_bar_perc, aes(x=Sample, y=Percentage, fill=Clusters)) +
geom_bar(stat="identity" ) +
AddStyle(title="", xlab = "",
fill=param$col_clusters,
legend_position="") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
p_umap = p_umap +
AddStyle(title="")
p = p1 | p2 +
patchwork::plot_annotation(title="Cluster identity per sample")
p = p | p_umap
p# Combine tables
rownames(tbl) = paste0(rownames(tbl),"_n")
rownames(tbl_perc) = paste0(rownames(tbl_perc),"_perc")
tbl = rbind(tbl, tbl_perc)
# Sort
tbl = tbl[order(rownames(tbl)),, drop=FALSE]
# Print table
knitr::kable(tbl, align="l", caption="Number of cells per cluster and sample") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", fixed_thead=TRUE)| 1 | 2 | 3 | 4 | 5 | 6 | 7 | |
|---|---|---|---|---|---|---|---|
| Sample1_n | 88.00 | 88.00 | 75.00 | 72.00 | 54.00 | 50.00 | 18.00 |
| Sample1_perc | 19.78 | 19.78 | 16.85 | 16.18 | 12.13 | 11.24 | 4.04 |
| Sample2_n | 124.00 | 111.00 | 102.00 | 98.00 | 89.00 | 70.00 | 39.00 |
| Sample2_perc | 19.59 | 17.54 | 16.11 | 15.48 | 14.06 | 11.06 | 6.16 |
# Reset default assay, so we won't plot integrated data
# Note: We need integrated data for UMAP, clusters
DefaultAssay(sc) = ifelse(param$norm=="SCT", param$norm, param$assay_raw)Here, the relationship (similarity) between clusters is represented in a cluster tree. The cluster tree can help to identify which clusters might be candidates for merging.
# Set seurat_clusters for as default tree
sc@tools$BuildClusterTree = sc@misc$trees$seurat_clusters
# pull the tree
cluster_tree <- Tool(object = sc, slot = "BuildClusterTree")
cluster_tree$tip.label <- paste0(" Cluster ", cluster_tree$tip.label)
# plot the tree
p1 = wrap_elements(full= ~ ape::plot.phylo(x = cluster_tree, type = "c", use.edge.length = TRUE, font = 2, cex = 1.5, tip.color = sc@misc$colour_lists$seurat_clusters, edge.width = 2))
p2 = p_umap / plot_spacer() +
plot_layout(widths = c(1, 2))
p_list = list(p1, plot_spacer(), p2)
p = wrap_plots(p_list) +
plot_layout(widths = c(10, 1, 4))
pIn contrast, the silhouette plot can aid in assessing cluster assignment.
The silhouette plot visualizes the separation distance between clusters, i.e. how close each point in one cluster is to points in the neighboring clusters, measured as a silhouette score or coefficients. This score has a range of [-1, 1]. A score near +1 indicate that the sample is far away from the neighboring clusters. A value of 0 indicates that the sample is on or very close to the decision boundary between two neighboring clusters and negative values indicate that those samples might have been assigned to the wrong cluster. Hence, the silhouette plot can aid in assignment cluster assignement and confirming the choice of cluster resolution. Each cluster would ideally contain large positive silhouette widths, indicating that it is well-separated from other clusters. In contrast, smaller widths can arise from the presence of internal subclusters or overclustering.
# Extracting distance matrix
distance_matrix = dist(Embeddings(sc[['pca']])[, 1:param$pc_n])
clusters = sc@meta.data$seurat_clusters
silhouette = cluster::silhouette(as.numeric(clusters), dist = distance_matrix)
sc@meta.data$silhouette_score = silhouette[,3]
mean_silhouette_score = mean(sc@meta.data$silhouette_score)
# Generate table
silhouette_tbl = sc@meta.data %>%
dplyr::mutate(barcode = rownames(.)) %>%
dplyr::arrange(seurat_clusters,-silhouette_score) %>%
dplyr::mutate(barcode = factor(barcode, levels = barcode))
# Plot
legend_title <- "Cluster"
p = ggplot(silhouette_tbl) +
geom_col(aes(barcode, silhouette_score, fill = seurat_clusters), show.legend = TRUE) +
geom_hline(yintercept = mean_silhouette_score, color = 'red', linetype = 'dashed') +
scale_x_discrete(name = 'Cells') +
scale_y_continuous(name = 'Silhouette score') +
scale_fill_manual(legend_title, values = param$col_clusters) +
theme_bw() +
theme(
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)
p
We next identify genes that are differentially expressed in one
cluster compared to all other clusters, based on RNA data and the method
“MAST”. Note that markers may bleed over between closely-related groups,
i.e. they are not forced to be specific to only one cluster.
Resulting p-values are adjusted using the Bonferroni method.
However, note that the p-values are likely inflated, since both
clusters and marker genes were determined based on the same gene
expression data, and there ought to be gene expression differences by
design. Nevertheless, p-values can be used to sort and
prioritize marker genes. We require marker genes to be expressed in at
least 25% of cells in the respective cluster, with a minimum log2 fold
change of 1 and adjusted p-value of at most 0.05. The results written to
file “markers_cluster_vs_rest.xlsx”.
As described above, cell clusters approximate cell types and states. But how do we know which cell types these are? To characterize cell clusters, we identify marker genes. Good marker genes are genes that are particularly expressed in one cluster, and existing knowledge of these marker genes can be used to extrapolate biological functions for the cluster. A good clustering of cells typically results in good marker genes. Hence, if you cannot find good marker genes you may need to go back to the start of the workflow and adapt your parameters. Note that we also determine genes that are particularly down-regulated in one cluster, even if these are not marker genes in the classical sense.
Good marker genes are highly and possibly even only expressed in one cluster as compared to all other clusters. However, sometimes marker genes are also expressed in other clusters, or are declared as marker genes in these clusters, for example cell lineage markers that are shared by different cell subtypes. To evaluate marker genes, it is essential to visualize their expression patterns.
In addition to detecting marker genes, it might be informative to detect genes that are differentially expressed between one specific cluster and one or several other clusters. This approach allows a more specific distinction of individual clusters and investigation of more subtle differences, see the section “Differentially expressed genes” below.# Find DEGs for every cluster compared to all remaining cells, report positive (=markers) and negative ones
# min.pct = requires feature to be detected at this minimum percentage in either of the two groups of cells
# logfc.threshold = requires a feature to be differentially expressed on average by some amount between the two groups
# only.pos = find only positive markers
# Review recommends using "MAST"; Or "wilcox" or "LR"
# ALWAYS USE: assay="RNA"/"Spatial" or assay="SCT"
# DONT USE: assay=integrated datasets
# Note: By default, the function uses slot="data". Mast requires log data, so this is the correct way to do it.
# https://www.bioconductor.org/packages/release/bioc/vignettes/MAST/inst/doc/MAST-interoperability.html
DefaultAssay(sc) = param$norm
if (param$norm=="SCT" & param$experimental_groups=="homogene") {
sc = PrepSCTFindMarkers(sc)
}
markers = Seurat::FindAllMarkers(sc, assay=param$norm, test.use="MAST", only.pos=FALSE,
min.pct=param$marker_pct, logfc.threshold=param$marker_log2FC,
latent.vars=param$latent_vars, verbose=FALSE, silent=TRUE)
# If no markers were found, initialise the degs table so that further downstream (export) chunks run
if (ncol(markers)==0) markers = DegsEmptyMarkerResultsTable(levels(sc$seurat_clusters))
# For Seurat versions until 3.2, log fold change is based on the natural log. Convert to log base 2.
if ("avg_logFC" %in% colnames(markers) & !"avg_log2FC" %in% colnames(markers)) {
lfc_idx = grep("avg_log\\S*FC", colnames(markers))
markers[,lfc_idx] = marker_deg_results[,lfc_idx] / log(2)
col_nms = colnames(markers)
col_nms[2] = "avg_log2FC"
colnames(markers) = col_nms
}
# Sort markers
markers = markers %>% DegsSort(group=c("cluster"))
# Filter markers
markers_filt = DegsFilter(markers, cut_log2FC=param$marker_log2FC, cut_padj=param$marker_padj)
markers_found = nrow(markers_filt$all)>0
# Add average data to table
markers_out = cbind(markers_filt$all, DegsAvgDataPerIdentity(sc, genes=markers_filt$all$gene, assay=param$assay_raw))
# Split by cluster and write to file
additional_readme = data.frame(Column=c("cluster",
"p_val_adj_score",
"avg_<assay>_<slot>_id<cluster>"),
Description=c("Cluster",
"Score calculated as follows: -log10(p_val_adj)*sign(avg_log2FC)",
"Average expression value for cluster; <assay>: RNA or SCT; <slot>: raw counts or normalised data"))
# Create directory
if (!file.exists(file.path(param$path_out, "data", "marker_genes"))) dir.create(file.path(param$path_out, "data", "marker_genes"), recursive=TRUE, showWarnings=FALSE)
invisible(DegsWriteToFile(split(markers_out, markers_out$cluster),
annot_ensembl=annot_ensembl,
gene_to_ensembl=seurat_rowname_to_ensembl,
additional_readme=additional_readme,
file=file.path(param$path_out, "data", "marker_genes", "markers_cluster_vs_rest.xlsx")))
# Plot number of differentially expressed genes
p = DegsPlotNumbers(markers_filt$all,
group="cluster",
title=paste0("Number of DEGs, comparing each cluster to the rest\n(FC=", 2^param$marker_log2FC, ", adj. p-value=", param$marker_padj, ")"))
# Add marker table to seurat object
Seurat::Misc(sc, "markers") = list(condition_column="seurat_clusters", test="MAST", padj=param$marker_padj,
log2FC=param$marker_log2FC, min_pct=param$marker_pct, assay=param$assay_raw, slot="data",
latent_vars=param$latent_vars,
results=markers_filt$all,
enrichr=EmptyEnrichrDf(overlap_split=TRUE))
# Add marker lists to seurat object
marker_genesets_up = split(markers_filt$up$gene, markers_filt$up$cluster)
names(marker_genesets_up) = paste0("markers_up_cluster", names(marker_genesets_up))
marker_genesets_down = split(markers_filt$down$gene, markers_filt$down$cluster)
names(marker_genesets_down) = paste0("markers_down_cluster", names(marker_genesets_down))
sc = ScAddLists(sc, lists=c(marker_genesets_up, marker_genesets_down), lists_slot="gene_lists")
if (markers_found) {
p
} else {
warning("No differentially expressed genes (cluster vs rest) found. The following related code is not executed, no related plots and tables are generated.")
}We use the term “marker genes” to specifically describe genes that are up-regulated in cells of one cluster compared to the rest.
if (markers_found) {
markers_top = DegsUpDisplayTop(markers_filt$up, n=5)
markers_top_6 = DegsUpDisplayTop(markers_filt$up, n=6)
markers_top_10 = DegsUpDisplayTop(markers_filt$up, n=10)
# Add labels
markers_top$labels = paste0(markers_top$cluster, ": ", markers_top$gene)
# Show table
knitr::kable(markers_top %>% dplyr::select(-labels), align="l", caption="Up to top 5 marker genes per cell cluster") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", height="500px")
}| cluster | gene | avg_log2FC | p_val | p_val_adj | pct.1 | pct.2 |
|---|---|---|---|---|---|---|
| 1 | NRGN | 4.004 | 5.7e-72 | 3.7e-68 | 0.066 | 0.294 |
| 1 | RPS12 | 1.046 | 2.1e-71 | 1.3e-67 | 0.958 | 0.997 |
| 1 | PIK3IP1 | 2.142 | 2.3e-52 | 1.5e-48 | 0.774 | 0.297 |
| 1 | CD3D | 1.393 | 2.5e-52 | 1.6e-48 | 0.877 | 0.311 |
| 1 | CCR7 | 2.812 | 1.7e-50 | 1.1e-46 | 0.627 | 0.127 |
| 2 | CST3 | 2.611 | 3.3e-113 | 2.1e-109 | 1.000 | 0.223 |
| 2 | TGFBI | 2.480 | 1.7e-104 | 1.1e-100 | 0.945 | 0.155 |
| 2 | PSAP | 2.304 | 1.0e-103 | 6.4e-100 | 0.995 | 0.512 |
| 2 | KLF4 | 1.789 | 9.5e-94 | 6.1e-90 | 0.955 | 0.206 |
| 2 | CCDC88A | 2.564 | 1.3e-93 | 8.5e-90 | 0.920 | 0.164 |
| 3 | IGHM.1 | 8.981 | 1.5e-172 | 9.3e-169 | 0.870 | 0.024 |
| 3 | CD79B | 5.639 | 3.1e-170 | 2.0e-166 | 0.972 | 0.104 |
| 3 | LINC00926 | 8.034 | 1.5e-155 | 1.0e-151 | 0.881 | 0.014 |
| 3 | CD22 | 8.825 | 1.3e-144 | 8.5e-141 | 0.831 | 0.010 |
| 3 | HLA-DQA1 | 3.659 | 2.3e-125 | 1.5e-121 | 0.966 | 0.202 |
| 4 | S100A8 | 4.066 | 2.1e-132 | 1.4e-128 | 1.000 | 0.248 |
| 4 | S100A9 | 3.161 | 1.2e-109 | 7.8e-106 | 1.000 | 0.290 |
| 4 | MNDA | 2.918 | 4.1e-105 | 2.7e-101 | 0.988 | 0.219 |
| 4 | LYZ | 2.630 | 9.8e-100 | 6.3e-96 | 1.000 | 0.308 |
| 4 | FCN1 | 2.448 | 1.8e-94 | 1.2e-90 | 0.988 | 0.209 |
| 5 | CD3D | 1.552 | 5.6e-58 | 3.6e-54 | 0.993 | 0.335 |
| 5 | IL7R | 2.083 | 9.0e-56 | 5.8e-52 | 0.944 | 0.305 |
| 5 | TRAC | 1.684 | 9.5e-54 | 6.1e-50 | 0.986 | 0.385 |
| 5 | CD3G | 1.623 | 1.1e-45 | 6.8e-42 | 0.909 | 0.294 |
| 5 | CD2 | 1.908 | 5.0e-38 | 3.2e-34 | 0.811 | 0.262 |
| 6 | GZMK | 4.865 | 6.9e-87 | 4.4e-83 | 0.842 | 0.050 |
| 6 | GZMA | 2.783 | 3.5e-84 | 2.2e-80 | 0.950 | 0.105 |
| 6 | NKG7 | 2.258 | 9.4e-82 | 6.0e-78 | 0.967 | 0.135 |
| 6 | KLRG1 | 3.554 | 6.0e-78 | 3.9e-74 | 0.900 | 0.120 |
| 6 | CTSW | 2.375 | 1.1e-70 | 7.1e-67 | 0.983 | 0.215 |
| 7 | KLRD1 | 5.450 | 1.8e-74 | 1.2e-70 | 1.000 | 0.049 |
| 7 | KLRF1 | 6.473 | 4.5e-73 | 2.9e-69 | 0.930 | 0.035 |
| 7 | PRF1 | 5.087 | 2.6e-72 | 1.7e-68 | 1.000 | 0.126 |
| 7 | CTSW | 3.865 | 4.4e-67 | 2.8e-63 | 1.000 | 0.262 |
| 7 | NKG7 | 4.751 | 9.9e-67 | 6.4e-63 | 1.000 | 0.184 |
The following plots visualize the top marker genes for each cluster, respectively. Clear marker genes indicate good clusters that represent cell types.
# Note: We need to run this chunk as it specifies a variable that is used in chunk definitions below
if (markers_found) {
# Feature plots and violin plots: each row contains 3 plots
# The plot has 5 columns and 1 rows per cluster, hence the layout works nicely if we find
height_per_row = 2.5
nr_rows_5cols = ceiling(nrow(markers_top)/5)
fig_height_5cols = height_per_row * nr_rows_5cols
# Violin plots and violin plots: each row contains 3 plots
# The plot has 4 columns and 2 rows per cluster, hence the layout works nicely if we find
# at least 6 markers per cluster
height_per_row = 1.5
nr_rows_3cols = ceiling(nrow(markers_top_6)/3)
fig_height_3cols = height_per_row * nr_rows_3cols
# Dotplots: each row contains 2 plots
# The height of dotplots is dependent on the number of clusters
height_per_row_variable = max(3, 0.3 * length(levels(sc$seurat_clusters)))
nr_rows_dp_2cols = ceiling(length(levels(sc$seurat_clusters))/2)
fig_height_dp_2cols = height_per_row_variable * nr_rows_dp_2cols %>% min(100)
} else {
fig_height_5cols = nr_rows_3cols = fig_height_dp_2cols = 7
}if (markers_found) {
# Plot each marker one by one, and then combine them all at the end
p_list = list()
for (i in 1:nrow(markers_top)) {
p_list[[i]] = Seurat::FeaturePlot(sc, features=markers_top$gene[i],
cols=c("lightgrey", param$col_clusters[markers_top$cluster[i]]),
combine=TRUE, label=TRUE) +
AddStyle(title=markers_top$labels[i],
xlab="", ylab="",
legend_position="bottom")
}
# Combine all plots
p = patchwork::wrap_plots(p_list, ncol=5) +
patchwork::plot_annotation(title="UMAP, cells coloured by normalised gene expression data, top marker genes per cluster")
p
}if (markers_found) {
# Plot violin plots per marker gene, and combine it all at the end
# This layout works out nicely if there are 4 marker genes per cluster
p_list = list()
for(i in 1:nrow(markers_top_6)) {
p_list[[i]] = Seurat::VlnPlot(sc, features=markers_top_6$gene[i], assay=param$norm, pt.size=0, cols=param$col_clusters) +
AddStyle(title=markers_top_6$labels[i], xlab="") +
theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1))
}
p = patchwork::wrap_plots(p_list, ncol=3) +
patchwork::plot_annotation(title="Violin plot of for normalised gene expression data, top marker genes per cluster") & theme(legend.position="none")
p
}if (markers_found) {
# Visualises how feature expression changes across different clusters
# Plot dotplots per cluster, and combine it all at the end
p_list = lapply(markers_top_10$cluster %>% sort() %>% unique(), function(cl) {
genes = markers_top_10 %>% dplyr::filter(cluster==cl) %>% dplyr::pull(gene)
p = suppressMessages(Seurat::DotPlot(sc, features=genes) +
scale_colour_gradient2(low="navy", mid="steelblue", high="darkgoldenrod1") +
AddStyle(title=paste0("Top marker genes in cluster ", cl), ylab="Cluster", xlab = "", legend_position="top") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) +
guides(size=guide_legend(order=1)))
return(p)
})
p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Dotplot with top 10 up-regulated marker genes (scaled)")
p
}
If the dataset contains multiple samples, we visualize the expression of marker genes that are up-regulated in a cluster separately for each sample.
#fig_height_degs_per_cl = max(5,
# max(2, 0.3 * (sc$orig.ident %>% unique() %>% length())) * length(levels(sc$seurat_clusters)) * 1) # We multiply by 2, as the legend requires quite some space
height_per_row_variable = max(3, 0.3 * (sc$orig.ident %>% unique() %>% length()))
nr_rows_dp_2cols = ceiling(length(levels(sc$seurat_clusters))/2)
fig_height_degs_per_cl = height_per_row_variable * nr_rows_dp_2cols %>% min(100)First, we plot normalized expression with no further scaling. This plot helps to get an impression of the total expression of a gene.
if (markers_found) {
n_genes_max_dotplot = 25
p_list = list()
for (cl in levels(sc$seurat_clusters)) {
markers_filt_up_cl_top = markers_filt$up %>%
dplyr::filter(cluster==cl) %>%
dplyr::top_n(n=n_genes_max_dotplot, wt=p_val_adj_score) %>%
dplyr::pull(gene)
if (length(markers_filt_up_cl_top) > 0) {
p_list[[cl]] = DotPlotUpdated(sc, features=markers_filt_up_cl_top, idents=cl, group.by="orig.ident", scale=FALSE, cols=c("navy", "steelblue", "darkgoldenrod1")) +
AddStyle(title=paste0("Cluster ", cl), ylab="", xlab = "", legend_position="top") +
theme(axis.text.x=element_text(angle=90, hjust=1, vjust=.5)) +
guides(size=guide_legend(order=1))
}
}
p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Dotplot per cluster with top 25 up-regulated marker genes (not scaled)")
p
}Second, we plot scaled expression as explained above. This plot allows us to judge whether the expression of a gene is increased in one sample as compared to the other samples. However, for small sample numbers, this plot might be misleading as differences can appear inflated.
if (markers_found) {
p_list = list()
markers_filt_up_top = DegsUpDisplayTop(degs=markers_filt$up, n=25)
for (cl in levels(sc$seurat_clusters)) {
markers_filt_up_cl_top = markers_filt_up_top %>%
dplyr::filter(cluster==cl) %>%
dplyr::pull(gene)
if (length(markers_filt_up_cl_top) > 0) {
p_list[[cl]] = suppressMessages(Seurat::DotPlot(sc, features=markers_filt_up_cl_top, idents=cl, group.by="orig.ident") +
scale_colour_gradient2(low="navy", mid="steelblue", high="darkgoldenrod1") +
#viridis::scale_color_viridis(option = "D") +
AddStyle(title=paste0("Cluster ", cl), ylab="", xlab = "", legend_position="top") +
theme(axis.text.x=element_text(angle=90, hjust=1, vjust=.5)) +
guides(size=guide_legend(order=1)))
}
}
p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Dotplot per cluster with top 25 up-regulated marker genes (scaled)")
p
}To gain first insights into potential functions of cells in a cluster, we test for over-representation of functional terms among up- and down-regulated genes of each cluster. We first translate gene symbols of up- and down-regulated genes per cluster into Entrez gene symbols, and then use the “enrichR” R-package to access the “Enrichr” website (https://amp.pharm.mssm.edu/Enrichr/). You can choose to test functional enrichment from a wide range of databases. Over-represented terms are written to files “functions_marker_up_cluster_X_vs_rest.xlsx” and “functions_marker_down_cluster_X_vs_rest.xlsx”.
# Set Enrichr database
enrichR::setEnrichrSite(param$enrichr_site)
if (markers_found) {
# Create directory
if (!file.exists(file.path(param$path_out, "data", "marker_genes"))) dir.create(file.path(param$path_out, "data", "marker_genes"), recursive=TRUE, showWarnings=FALSE)
# Upregulated markers
# Convert Seurat names of upregulated marker per cluster to Entrez; use named lists for translation
# Is that still neccessary?
marker_genesets_up = sapply(levels(sc$seurat_clusters), function(x) {
tmp = markers_filt$up %>% dplyr::filter(cluster==x) %>% dplyr::pull(gene)
tmp = sapply(tmp, function(n) seurat_rowname_to_entrez[[n]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
return(tmp[!is.na(tmp)])
}, USE.NAMES=TRUE, simplify=TRUE)
# Tests done by Enrichr
marker_genesets_up_enriched = purrr::map(marker_genesets_up, EnrichrTest, databases=param$enrichr_dbs, padj=param$enrichr_padj)
marker_genesets_up_enriched = purrr::map(list_names(marker_genesets_up_enriched), function(n) {
return(purrr::map(marker_genesets_up_enriched[[n]], function(d){
return(cbind(d, Cluster=rep(n, nrow(d)), Direction=rep("up", nrow(d))))
}))
})
# Write to files
invisible(purrr::map(names(marker_genesets_up_enriched), function(n) {
EnrichrWriteResults(enrichr_results=marker_genesets_up_enriched[[n]],
file=file.path(param$path_out, "data", "marker_genes", paste0("functions_marker_up_cluster_", n, "_vs_rest.xlsx")))
}))
# Downregulated markers
# Convert Seurat names of downregulated marker per cluster to Entrez; use named lists for translation
# Is that still neccessary?
marker_genesets_down = sapply(levels(sc$seurat_clusters), function(x) {
tmp = markers_filt$down %>% dplyr::filter(cluster==x) %>% dplyr::pull(gene)
tmp = sapply(tmp, function(x) seurat_rowname_to_entrez[[x]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
return(tmp[!is.na(tmp)])
}, USE.NAMES=TRUE, simplify=TRUE)
# Tests done by Enrichr
marker_genesets_down_enriched = purrr::map(marker_genesets_down, EnrichrTest, databases=param$enrichr_dbs, padj=param$enrichr_padj)
marker_genesets_down_enriched = purrr::map(list_names(marker_genesets_down_enriched), function(n) {
return(purrr::map(marker_genesets_down_enriched[[n]], function(d){
return(cbind(d, Cluster=rep(n, nrow(d)), Direction=rep("down", nrow(d))))
}))
})
# Write to files
invisible(purrr::map(names(marker_genesets_down_enriched), function(n) {
EnrichrWriteResults(enrichr_results=marker_genesets_down_enriched[[n]],
file=file.path(param$path_out, "data", "marker_genes", paste0("functions_marker_down_cluster_", n, "_vs_rest.xlsx")))
}))
# Combine, flatten into data.frame and add to sc misc slot
marker_genesets_enriched = c(marker_genesets_up_enriched, marker_genesets_down_enriched)
marker_genesets_enriched = unname(marker_genesets_enriched)
marker_genesets_enriched = purrr::map(marker_genesets_enriched, FlattenEnrichr) %>% dplyr::bind_rows()
marker_genesets_enriched$Cluster = factor(marker_genesets_enriched$Cluster, levels=levels(sc$seurat_clusters))
marker_genesets_enriched$Direction = factor(marker_genesets_enriched$Direction, levels=c("up", "down"))
misc_content = Misc(sc, "markers")
misc_content[["enrichr"]] = marker_genesets_enriched
suppressWarnings({Misc(sc, "markers") = misc_content})
}The following table contains the top enriched terms per cluster and utilized database (GO_Biological_Process_2023, WikiPathway_2023_Human, Azimuth_2023, CellMarker_2024).
# Top enriched terms (TODO: better plots, functions)
if (markers_found) {
# Get top ten up and down over all databases per cluster
marker_genesets_top_enriched = marker_genesets_enriched %>% dplyr::group_by(Cluster, Database) %>% dplyr::filter(Direction=="up") %>%
dplyr::top_n(n=5, wt=Combined.Score)
# Print as tabs
enr_table = list()
#cat("### {.tabset} \n \n")
for(n in levels(marker_genesets_top_enriched$Cluster)){
#cat("#### ", n, " \n")
enr_table[[n]] = marker_genesets_top_enriched %>% dplyr::ungroup() %>% dplyr::filter(Cluster==n) %>%
dplyr::select(Database, Term, Direction, Adjusted.P.value, Odds.Ratio, Combined.Score)
#enr_table = marker_genesets_top_enriched %>% dplyr::ungroup() %>% dplyr::filter(Cluster==n) %>%
# dplyr::select(Database, Term, Direction, Adjusted.P.value, Odds.Ratio, Combined.Score)
#print(knitr::kable(enr_table, align="l", caption="Top ten enriched terms per geneset", format="html") %>%
#kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
#kableExtra::scroll_box(width="100%", height="700px"))
#cat(" \n \n")
}
#cat(" \n \n")
concat_enr_table = data.table::rbindlist(enr_table, use.names = TRUE, idcol = "Cluster")
knitr::kable(concat_enr_table, align="l", caption="Top ten enriched terms per geneset", format="html") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", height="500px")
}| Cluster | Database | Term | Direction | Adjusted.P.value | Odds.Ratio | Combined.Score |
|---|---|---|---|---|---|---|
| 1 | GO_Biological_Process_2023 | T Cell Activation (GO:0042110) | up | 0.0002176 | 24.649689 | 361.82784 |
| 1 | GO_Biological_Process_2023 | Immunological Synapse Formation (GO:0001771) | up | 0.0356235 | 159.544000 | 1417.98073 |
| 1 | WikiPathway_2023_Human | Modulators Of TCR Signaling And T Cell Activation WP5072 | up | 0.0429244 | 20.995778 | 158.11260 |
| 1 | WikiPathway_2023_Human | Pathogenesis Of SARS CoV 2 Mediated By Nsp9 Nsp10 Complex WP4884 | up | 0.0429244 | 41.955790 | 277.26135 |
| 1 | WikiPathway_2023_Human | T Cell Activation SARS CoV 2 WP5098 | up | 0.0429244 | 14.307083 | 92.49825 |
| 1 | WikiPathway_2023_Human | Cancer Immunotherapy By PD 1 Blockade WP4585 | up | 0.0429244 | 37.956190 | 243.88595 |
| 1 | Azimuth_2023 | Bone Marrow-L2-CD4 Naive | up | 0.0000000 | 867.173913 | 27391.85175 |
| 1 | Azimuth_2023 | PBMC-L2-CD4+ Naive T | up | 0.0000000 | 530.425532 | 13329.95772 |
| 1 | Azimuth_2023 | Bone Marrow-L1-CD4 T | up | 0.0000000 | 530.425532 | 13329.95772 |
| 1 | Azimuth_2023 | PBMC-L2-CD8+ Naive T | up | 0.0000001 | 332.383333 | 6348.60155 |
| 1 | Azimuth_2023 | Adipose-L1-T | up | 0.0000001 | 332.383333 | 6348.60155 |
| 1 | Azimuth_2023 | PBMC-L3-CD4+ Naive T | up | 0.0000001 | 332.383333 | 6348.60155 |
| 1 | Azimuth_2023 | Bone Marrow-L2-CD8 Naive | up | 0.0000001 | 332.383333 | 6348.60155 |
| 1 | Azimuth_2023 | Lung v1-L1-CD4 T | up | 0.0000001 | 332.383333 | 6348.60155 |
| 1 | CellMarker_2024 | Naive CD8+ T Cell Peripheral Blood Human | up | 0.0000000 | 133.150409 | 8832.83924 |
| 1 | CellMarker_2024 | Naive CD4+ T Cell Peripheral Blood Human | up | 0.0000000 | 148.903654 | 5318.49258 |
| 1 | CellMarker_2024 | T Cell Bone Marrow Human | up | 0.0000000 | 182.375163 | 5355.60983 |
| 1 | CellMarker_2024 | T Cell Stomach Human | up | 0.0000000 | 144.420290 | 3500.41482 |
| 1 | CellMarker_2024 | T Cell Lymphoid Tissue Human | up | 0.0000168 | 244.200000 | 3394.17410 |
| 2 | GO_Biological_Process_2023 | Immune Response-Inhibiting Cell Surface Receptor Signaling Pathway (GO:0002767) | up | 0.0012019 | 45.170901 | 502.71042 |
| 2 | GO_Biological_Process_2023 | Positive Regulation Of CD4-positive, CD25-positive, Alpha-Beta Regulatory T Cell Differentiation (GO:0032831) | up | 0.0044526 | 67.607143 | 622.50307 |
| 2 | GO_Biological_Process_2023 | Regulation Of Metallopeptidase Activity (GO:1905048) | up | 0.0071789 | 45.069124 | 384.47910 |
| 2 | GO_Biological_Process_2023 | Toll-Like Receptor 3 Signaling Pathway (GO:0034138) | up | 0.0071789 | 45.069124 | 384.47910 |
| 2 | GO_Biological_Process_2023 | Positive Regulation Of Lipopolysaccharide-Mediated Signaling Pathway (GO:0031666) | up | 0.0071789 | 45.069124 | 384.47910 |
| 2 | WikiPathway_2023_Human | TYROBP Causal Network In Microglia WP3945 | up | 0.0000034 | 11.244562 | 211.56945 |
| 2 | WikiPathway_2023_Human | Degradation Pathway Of Sphingolipids Including Diseases WP4153 | up | 0.0002658 | 24.744147 | 340.39054 |
| 2 | WikiPathway_2023_Human | IL1 And Megakaryocytes In Obesity WP2865 | up | 0.0122561 | 11.905458 | 105.08445 |
| 2 | WikiPathway_2023_Human | Activation Of NLRP3 Inflammasome By SARS CoV 2 WP4876 | up | 0.0243088 | 33.800115 | 269.98228 |
| 2 | WikiPathway_2023_Human | Macrophage Markers WP4146 | up | 0.0398347 | 22.531106 | 160.98031 |
| 2 | Azimuth_2023 | Adipose-L2-hMono1 | up | 0.0000000 | 159.217442 | 3704.48949 |
| 2 | Azimuth_2023 | Adipose-L1-Monocyte | up | 0.0000008 | 90.765661 | 1688.27078 |
| 2 | Azimuth_2023 | Liver-L2-Monocyte | up | 0.0000008 | 90.765661 | 1688.27078 |
| 2 | Azimuth_2023 | PBMC-L1-Monocyte | up | 0.0000141 | 56.594329 | 813.63816 |
| 2 | Azimuth_2023 | PBMC-L1-dendritic Cell | up | 0.0000141 | 56.594329 | 813.63816 |
| 2 | Azimuth_2023 | PBMC-L2-CD14+ Monocyte | up | 0.0000141 | 56.594329 | 813.63816 |
| 2 | Azimuth_2023 | PBMC-L3-CD14+ Monocyte | up | 0.0000141 | 56.594329 | 813.63816 |
| 2 | Azimuth_2023 | Fetal Development-L2-Pancreas-Myeloid Cells | up | 0.0000141 | 56.594329 | 813.63816 |
| 2 | Azimuth_2023 | Tonsil-L2-Neutrophil Granulocytes | up | 0.0000141 | 56.594329 | 813.63816 |
| 2 | Azimuth_2023 | Lung v1-L2-Nonclassical Monocyte | up | 0.0000141 | 56.594329 | 813.63816 |
| 2 | Azimuth_2023 | Adipose-L2-hMono2 | up | 0.0000141 | 56.594329 | 813.63816 |
| 2 | Azimuth_2023 | Liver-L1-Myeloid | up | 0.0000141 | 56.594329 | 813.63816 |
| 2 | CellMarker_2024 | Monocyte Fetal Kidney Human | up | 0.0000000 | 17.588173 | 4631.13519 |
| 2 | CellMarker_2024 | CD1C-CD141- Dendritic Cell Blood Human | up | 0.0000000 | 20.532637 | 3310.47008 |
| 2 | CellMarker_2024 | CD1C+ B Dendritic Cell Blood Human | up | 0.0000000 | 32.355572 | 1650.02841 |
| 2 | CellMarker_2024 | Mononuclear Phagocyte Sputum Human | up | 0.0000000 | 106.139535 | 2343.75702 |
| 2 | CellMarker_2024 | Monocyte Undefined Human | up | 0.0000000 | 272.324826 | 5731.89220 |
| 3 | GO_Biological_Process_2023 | Immunoglobulin Mediated Immune Response (GO:0016064) | up | 0.0000009 | 48.710379 | 1020.91678 |
| 3 | GO_Biological_Process_2023 | Immunoglobulin Production Involved In Immunoglobulin-Mediated Immune Response (GO:0002381) | up | 0.0000332 | 56.278345 | 896.55833 |
| 3 | GO_Biological_Process_2023 | MHC Class II Protein Complex Assembly (GO:0002399) | up | 0.0001672 | 62.548032 | 831.97054 |
| 3 | GO_Biological_Process_2023 | Peptide Antigen Assembly With MHC Class II Protein Complex (GO:0002503) | up | 0.0001672 | 62.548032 | 831.97054 |
| 3 | GO_Biological_Process_2023 | Positive Regulation Of CD4-positive, CD25-positive, Alpha-Beta Regulatory T Cell Differentiation (GO:0032831) | up | 0.0171230 | 102.666667 | 798.18816 |
| 3 | WikiPathway_2023_Human | B Cell Receptor Signaling Pathway WP23 | up | 0.0000351 | 15.319396 | 239.69336 |
| 3 | WikiPathway_2023_Human | Allograft Rejection WP2328 | up | 0.0002293 | 13.457248 | 175.97467 |
| 3 | WikiPathway_2023_Human | Ebola Virus Infection In Host WP4217 | up | 0.0016554 | 9.137295 | 97.71566 |
| 3 | WikiPathway_2023_Human | Prion Disease Pathway WP3995 | up | 0.0466664 | 16.034483 | 108.03438 |
| 3 | WikiPathway_2023_Human | Genetic Causes Of Porto Sinusoidal Vascular Disease WP5269 | up | 0.0496391 | 13.673024 | 86.31243 |
| 3 | Azimuth_2023 | PBMC-L1-B Cell | up | 0.0000000 | 197.073413 | 4020.91851 |
| 3 | Azimuth_2023 | PBMC-L1-dendritic Cell | up | 0.0000000 | 197.073413 | 4020.91851 |
| 3 | Azimuth_2023 | Adipose-L2-hcDC2 | up | 0.0000000 | 197.073413 | 4020.91851 |
| 3 | Azimuth_2023 | Liver-L1-B | up | 0.0000000 | 197.073413 | 4020.91851 |
| 3 | Azimuth_2023 | Liver-L2-B | up | 0.0000000 | 197.073413 | 4020.91851 |
| 3 | Azimuth_2023 | Bone Marrow-L2-Naive B | up | 0.0000000 | 197.073413 | 4020.91851 |
| 3 | CellMarker_2024 | B Cell Kidney Human | up | 0.0000000 | 61.235433 | 13077.25267 |
| 3 | CellMarker_2024 | B Cell Skin Mouse | up | 0.0000000 | 97.642623 | 3078.98611 |
| 3 | CellMarker_2024 | B Cell Lung Human | up | 0.0000000 | 158.904000 | 3732.48288 |
| 3 | CellMarker_2024 | Follicular B Cell Spleen Human | up | 0.0000000 | 394.186508 | 8744.79847 |
| 3 | CellMarker_2024 | B Cell Lung Mouse | up | 0.0000464 | 232.816406 | 2983.51552 |
| 3 | CellMarker_2024 | Naive B Cell Spleen Human | up | 0.0000464 | 232.816406 | 2983.51552 |
| 4 | GO_Biological_Process_2023 | Vacuolar Acidification (GO:0007035) | up | 0.0001630 | 18.153938 | 261.97013 |
| 4 | GO_Biological_Process_2023 | Leukocyte Aggregation (GO:0070486) | up | 0.0006864 | 75.488953 | 956.76985 |
| 4 | GO_Biological_Process_2023 | Lysosomal Lumen Acidification (GO:0007042) | up | 0.0013799 | 21.827479 | 248.27203 |
| 4 | GO_Biological_Process_2023 | Regulation Of Leukocyte Degranulation (GO:0043300) | up | 0.0087622 | 42.338362 | 365.53648 |
| 4 | GO_Biological_Process_2023 | p38MAPK Cascade (GO:0038066) | up | 0.0116901 | 33.868966 | 276.93909 |
| 4 | WikiPathway_2023_Human | TYROBP Causal Network In Microglia WP3945 | up | 0.0000003 | 14.159292 | 301.38896 |
| 4 | WikiPathway_2023_Human | Degradation Pathway Of Sphingolipids Including Diseases WP4153 | up | 0.0013215 | 23.647640 | 276.32871 |
| 4 | WikiPathway_2023_Human | Activation Of NLRP3 Inflammasome By SARS CoV 2 WP4876 | up | 0.0064493 | 42.338362 | 365.53648 |
| 4 | WikiPathway_2023_Human | Macrophage Markers WP4146 | up | 0.0099703 | 28.222701 | 219.69663 |
| 4 | WikiPathway_2023_Human | COVID 19 And Endothelial Cell Senescence WP5256 | up | 0.0268984 | 37.528176 | 218.44006 |
| 4 | Azimuth_2023 | PBMC-L2-CD14+ Monocyte | up | 0.0000001 | 113.889855 | 2267.82956 |
| 4 | Azimuth_2023 | PBMC-L3-CD14+ Monocyte | up | 0.0000001 | 113.889855 | 2267.82956 |
| 4 | Azimuth_2023 | Adipose-L2-hMono1 | up | 0.0000001 | 113.889855 | 2267.82956 |
| 4 | Azimuth_2023 | Liver-L2-Monocyte | up | 0.0000001 | 113.889855 | 2267.82956 |
| 4 | Azimuth_2023 | Bone Marrow-L2-CD14 Monocyte | up | 0.0000001 | 113.889855 | 2267.82956 |
| 4 | CellMarker_2024 | Monocyte Fetal Kidney Human | up | 0.0000000 | 27.805576 | 9065.47514 |
| 4 | CellMarker_2024 | CD1C-CD141- Dendritic Cell Blood Human | up | 0.0000000 | 15.686061 | 1609.52197 |
| 4 | CellMarker_2024 | CD1C+ B Dendritic Cell Blood Human | up | 0.0000000 | 51.882801 | 3493.23744 |
| 4 | CellMarker_2024 | Macrophage Rectum Human | up | 0.0000001 | 113.889855 | 2267.82956 |
| 4 | CellMarker_2024 | Monocyte Undefined Human | up | 0.0000011 | 141.958092 | 2445.41040 |
| 5 | GO_Biological_Process_2023 | T Cell Activation (GO:0042110) | up | 0.0000000 | 34.321799 | 820.11926 |
| 5 | GO_Biological_Process_2023 | T-helper Cell Differentiation (GO:0042093) | up | 0.0020081 | 69.912281 | 755.45170 |
| 5 | GO_Biological_Process_2023 | Negative Regulation Of Phosphatidylinositol 3-Kinase Signaling (GO:0014067) | up | 0.0108912 | 98.192118 | 791.92876 |
| 5 | GO_Biological_Process_2023 | Regulation Of Transmembrane Transporter Activity (GO:0022898) | up | 0.0108912 | 98.192118 | 791.92876 |
| 5 | GO_Biological_Process_2023 | Response To Interleukin-4 (GO:0070670) | up | 0.0108912 | 98.192118 | 791.92876 |
| 5 | WikiPathway_2023_Human | Modulators Of TCR Signaling And T Cell Activation WP5072 | up | 0.0000056 | 40.171717 | 699.28370 |
| 5 | WikiPathway_2023_Human | Th17 Cell Differentiation Pathway WP5130 | up | 0.0002028 | 28.232955 | 370.64784 |
| 5 | WikiPathway_2023_Human | Pathogenesis Of SARS CoV 2 Mediated By Nsp9 Nsp10 Complex WP4884 | up | 0.0016753 | 58.251462 | 601.36439 |
| 5 | WikiPathway_2023_Human | Cancer Immunotherapy By PD 1 Blockade WP4585 | up | 0.0017770 | 52.421053 | 526.38584 |
| 5 | WikiPathway_2023_Human | T Cell Receptor And Co Stimulatory Signaling WP2583 | up | 0.0018068 | 41.926316 | 395.66236 |
| 5 | Azimuth_2023 | Bone Marrow-L1-CD4 T | up | 0.0000000 | 738.407407 | 22660.76312 |
| 5 | Azimuth_2023 | Bone Marrow-L2-CD8 Memory | up | 0.0000000 | 453.090909 | 11050.61134 |
| 5 | Azimuth_2023 | Lung v1-L1-CD4 T | up | 0.0000000 | 453.090909 | 11050.61134 |
| 5 | Azimuth_2023 | PBMC-L2-CD4+ Naive T | up | 0.0000001 | 284.785714 | 5272.34315 |
| 5 | Azimuth_2023 | Adipose-L1-T | up | 0.0000001 | 284.785714 | 5272.34315 |
| 5 | Azimuth_2023 | PBMC-L3-CD4+ Central Memory T 2 | up | 0.0000001 | 284.785714 | 5272.34315 |
| 5 | Azimuth_2023 | Bone Marrow-L1-Other T | up | 0.0000001 | 284.785714 | 5272.34315 |
| 5 | Azimuth_2023 | Bone Marrow-L2-CD4 Memory | up | 0.0000001 | 284.785714 | 5272.34315 |
| 5 | Azimuth_2023 | Bone Marrow-L2-CD4 Naive | up | 0.0000001 | 284.785714 | 5272.34315 |
| 5 | Azimuth_2023 | Bone Marrow-L2-Mucosal Associated Invariant T | up | 0.0000001 | 284.785714 | 5272.34315 |
| 5 | CellMarker_2024 | T Cell Bone Marrow Human | up | 0.0000000 | 191.576923 | 6415.61849 |
| 5 | CellMarker_2024 | Naive T(Th0) Cell Colorectum Human | up | 0.0000000 | 712.071429 | 14693.53180 |
| 5 | CellMarker_2024 | Naive CD4+ T Cell Blood Human | up | 0.0000001 | 356.000000 | 6799.21697 |
| 5 | CellMarker_2024 | T Cell Lymphoid Tissue Human | up | 0.0000001 | 356.000000 | 6799.21697 |
| 5 | CellMarker_2024 | T Cell Lymphoid Tissue Mouse | up | 0.0000040 | 524.684210 | 7964.59750 |
| 6 | GO_Biological_Process_2023 | T Cell Activation (GO:0042110) | up | 0.0000006 | 23.951652 | 503.20780 |
| 6 | GO_Biological_Process_2023 | Immunological Synapse Formation (GO:0001771) | up | 0.0004239 | 189.056962 | 2454.66207 |
| 6 | GO_Biological_Process_2023 | Alpha-Beta T Cell Activation (GO:0046631) | up | 0.0035876 | 62.993671 | 657.80825 |
| 6 | GO_Biological_Process_2023 | Cellular Response To Copper Ion (GO:0071280) | up | 0.0057912 | 44.454952 | 424.04507 |
| 6 | GO_Biological_Process_2023 | Positive Regulation Of Antigen Receptor-Mediated Signaling Pathway (GO:0050857) | up | 0.0057912 | 44.454952 | 424.04507 |
| 6 | GO_Biological_Process_2023 | Lens Fiber Cell Differentiation (GO:0070306) | up | 0.0385587 | 62.218750 | 449.24808 |
| 6 | WikiPathway_2023_Human | Pathogenesis Of SARS CoV 2 Mediated By Nsp9 Nsp10 Complex WP4884 | up | 0.0000020 | 80.771104 | 1432.74689 |
| 6 | WikiPathway_2023_Human | Cancer Immunotherapy By PD 1 Blockade WP4585 | up | 0.0000020 | 71.789322 | 1237.77880 |
| 6 | WikiPathway_2023_Human | Modulators Of TCR Signaling And T Cell Activation WP5072 | up | 0.0000076 | 28.511483 | 442.33145 |
| 6 | WikiPathway_2023_Human | T Cell Receptor Signaling Pathway WP69 | up | 0.0000196 | 22.710526 | 324.11319 |
| 6 | WikiPathway_2023_Human | T Cell Receptor And Co Stimulatory Signaling WP2583 | up | 0.0001236 | 42.508547 | 518.98916 |
| 6 | Azimuth_2023 | Liver-L2-CD8 T | up | 0.0000000 | 929.413333 | 32681.07141 |
| 6 | Azimuth_2023 | PBMC-L3-Mucosal Associated Invariant T | up | 0.0000000 | 524.078947 | 15065.37560 |
| 6 | Azimuth_2023 | Liver-L1-T/NK | up | 0.0000000 | 524.078947 | 15065.37560 |
| 6 | Azimuth_2023 | Bone Marrow-L2-Mucosal Associated Invariant T | up | 0.0000000 | 524.078947 | 15065.37560 |
| 6 | Azimuth_2023 | Lung v1-L1-CD8 T | up | 0.0000000 | 524.078947 | 15065.37560 |
| 6 | CellMarker_2024 | T Cell Bone Marrow Human | up | 0.0000000 | 284.371429 | 14796.74226 |
| 6 | CellMarker_2024 | CD8+ T Cell Undefined Human | up | 0.0000000 | 306.832192 | 12228.05918 |
| 6 | CellMarker_2024 | Naive CD8+ T Cell Bile Duct Human | up | 0.0000000 | 646.623377 | 15887.39418 |
| 6 | CellMarker_2024 | Cytotoxic T Cell Peripheral Blood Human | up | 0.0000000 | 646.623377 | 15887.39418 |
| 6 | CellMarker_2024 | Memory CD8+ T Cell Peripheral Blood Human | up | 0.0000000 | 1021.384615 | 20892.31460 |
| 7 | GO_Biological_Process_2023 | Natural Killer Cell Mediated Immunity (GO:0002228) | up | 0.0053425 | 23.303803 | 276.57638 |
| 7 | GO_Biological_Process_2023 | Negative Regulation Of Natural Killer Cell Mediated Cytotoxicity (GO:0045953) | up | 0.0142139 | 26.312000 | 265.80056 |
| 7 | GO_Biological_Process_2023 | Type II Interferon-Mediated Signaling Pathway (GO:0060333) | up | 0.0267576 | 39.322709 | 343.52501 |
| 7 | GO_Biological_Process_2023 | T-helper Cell Lineage Commitment (GO:0002295) | up | 0.0267576 | 33.703472 | 282.73146 |
| 7 | GO_Biological_Process_2023 | Negative Regulation Of Natural Killer Cell Mediated Immunity (GO:0002716) | up | 0.0267576 | 33.703472 | 282.73146 |
| 7 | WikiPathway_2023_Human | Proteasome Degradation WP183 | up | 0.0000339 | 13.649365 | 222.35594 |
| 7 | WikiPathway_2023_Human | Interactions Of Natural Killer Cells In Pancreatic Cancer WP5092 | up | 0.0002375 | 21.690616 | 296.10146 |
| 7 | WikiPathway_2023_Human | Cancer Immunotherapy By PD 1 Blockade WP4585 | up | 0.0009085 | 22.008032 | 256.03162 |
| 7 | WikiPathway_2023_Human | Iron Sulfur Cluster Biogenesis WP5152 | up | 0.0255013 | 19.655379 | 139.61244 |
| 7 | WikiPathway_2023_Human | TCA Cycle Nutrient Use And Invasiveness Of Ovarian Cancer WP2868 | up | 0.0343871 | 52.230159 | 337.35040 |
| 7 | Azimuth_2023 | Liver-L2-NK | up | 0.0000000 | 279.773279 | 7577.37452 |
| 7 | Azimuth_2023 | Bone Marrow-L2-Innate Lymphoid Cell | up | 0.0000000 | 159.217742 | 3480.04251 |
| 7 | Azimuth_2023 | Lung V2 (HLCA)-ann Level 4-NK Cells | up | 0.0000011 | 99.106426 | 1692.28584 |
| 7 | Azimuth_2023 | PBMC-L2-gamma-delta T | up | 0.0000011 | 99.106426 | 1692.28584 |
| 7 | Azimuth_2023 | Lung V2 (HLCA)-ann Finest level-NK Cells | up | 0.0000011 | 99.106426 | 1692.28584 |
| 7 | Azimuth_2023 | PBMC-L3-CD8+ Effector Memory T 2 | up | 0.0000011 | 99.106426 | 1692.28584 |
| 7 | Azimuth_2023 | Lung v1-L1-Natural Killer | up | 0.0000011 | 99.106426 | 1692.28584 |
| 7 | Azimuth_2023 | Lung V2 (HLCA)-ann Level 3-Innate Lymphoid Cell NK | up | 0.0000011 | 99.106426 | 1692.28584 |
| 7 | CellMarker_2024 | Cytotoxic CD4+ T Cell Liver Human | up | 0.0000000 | 40.374905 | 2793.55992 |
| 7 | CellMarker_2024 | Cytotoxic Cell Stomach Human | up | 0.0000000 | 90.633674 | 2667.11335 |
| 7 | CellMarker_2024 | Cytotoxic T Cell Colorectum Human | up | 0.0000002 | 198.232932 | 3735.97305 |
| 7 | CellMarker_2024 | Cytotoxic T Cell Brain Human | up | 0.0000034 | 315.920000 | 5019.58323 |
| 7 | CellMarker_2024 | T helper(Th) Cell Brain Mouse | up | 0.0000079 | 157.952000 | 2337.72120 |
Single cell transcriptomes can be difficult to annotate without extensive knowledge of the underlying biology. Given a reference dataset (of samples from single-cell or bulk RNA sequencing) with known labels, it is possible to assign labels to the cells from a test dataset based on similarity to that reference. Hence, the biological knowledge (defined marker genes and cluster identities) can be propagated from the reference dataset to the test dataset in an automated manner and aid in cluster identification.
Here, we performed cell and cluster annotation with cell types from the reference dataset BlueprintEncodeData() obtained from celldex (https://bioconductor.org/packages/3.14/data/experiment/vignettes/celldex/inst/doc/userguide.html). The celldex package provides access to several reference datasets (mostly derived from bulk RNA-seq or microarray data).
# Required libraries
library(SingleR)
library(celldex)
# Load reference datasets
# Reference dataset obtained from celldex for singleR
# https://bioconductor.org/packages/3.14/data/experiment/vignettes/celldex/inst/doc/userguide.html
# Function to paste reference name and download reference dataset
str_eval=function(x) {return(eval(parse(text=x)))}
ref = str_eval(param$annotation_dbs)
ref_name = gsub("[()]","",param$annotation_dbs)# Annotate cells and clusters using SingleR with reference dataset form celldex
sce_ann_cells = SingleR::SingleR(test = GetAssayData(sc, assay = param$norm, slot = 'data'),
ref = ref, assay.type.test = 1, labels = ref@colData@listData$label.main)
sce_ann_clusters = SingleR::SingleR(test = GetAssayData(sc, assay = param$norm, slot = 'data'),
ref = ref, assay.type.test = 1, labels = ref@colData@listData$label.main, clusters = sc$seurat_clusters)
annotated_cells = table(sce_ann_cells$labels)
annotated_clusters = table(sce_ann_clusters$labels)
sc[["SingleR.labels"]] = sce_ann_cells$labels
sc[["SingleR.cluster.labels"]] = sce_ann_clusters$labels[match(sc[[]][["seurat_clusters"]], rownames(sce_ann_clusters))]UMAPs display cells colored by Seurat clusters and cell types annotated by SingleR. Annotation was performed for each cell. The annotation of each cell is more sensitive, but also more prone to artefacts compared to the annotation of clusters as performed in later steps. Here, we perform annotation of single cells for annotation diagnostics, that means for assessment of the reliability of cell type annotation and how close all cells resemble the cell types of the reference dataset.
# Print table with cell annotation
knitr::kable(annotated_cells, align="l", caption="Cell types assigned to cells", format="html") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", height="300px")| Var1 | Freq |
|---|---|
| B-cells | 180 |
| CD4+ T-cells | 218 |
| CD8+ T-cells | 232 |
| Erythrocytes | 5 |
| HSC | 7 |
| Mesangial cells | 1 |
| Monocytes | 368 |
| Neutrophils | 1 |
| NK cells | 66 |
# Visualization of clusters
p_umap = Seurat::DimPlot(sc, reduction="umap", group.by="seurat_clusters", pt.size=param$pt_size) +
scale_color_manual(values=param$col_clusters, labels=cluster_labels) +
AddStyle(title="Clusters", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "bottom") +
theme(title = element_text(size = 10)) +
NoGrid() +
NoLegend()
p_umap = LabelClusters(p_umap, id="seurat_clusters", box=TRUE, segment.color="black", fill="white", repel = TRUE)
# Visualization of singleR annotation of cells
p2 = Seurat::DimPlot(sc, reduction="umap", group.by="SingleR.labels", pt.size=param$pt_size) +
AddStyle(title="Cell types \n(annotation per cell)", xlab = "UMAP 1", ylab = "UMAP 2", legend_position="bottom", legend_title="Cell types") +
theme(title = element_text(size = 10)) +
NoGrid()
p = p_umap + p2
pHeatmap displays the scores for all cells across all reference labels, allowing to assess the confidence of the corresponding predicted labels. Ideally, each cell should have one score that is distinctively higher than the rest, indicating that an unambiguous assignment.
# Annotation diagnostics
# https://bioconductor.org/packages/devel/bioc/vignettes/SingleR/inst/doc/SingleR.html
p = plotScoreHeatmap(sce_ann_cells)
p
Plot displaying per-cell “deltas” (the difference between the score for the assigned label and the median across all labels). Low deltas indicate that the assignment is uncertain. The minimum threshold on the deltas is defined using an outlier-based approach. Yellow marked points represents outliers that fell below the threshold.
# Set figure height for diagnostic plot
number_fig_dig = length(unique(sce_ann_cells$pruned.labels))
fig_height_dig = ceiling(number_fig_dig/6)*3# Annotation diagnostics
p = plotDeltaDistribution(sce_ann_cells, ncol = 6) +
NoLegend()
pnumber_pruned_table = table(is.na(sce_ann_cells$pruned.labels))
number_pruned_table[3]=round((number_pruned_table[1]/(number_pruned_table[1]+number_pruned_table[2])*100),2)
number_pruned_table[4]=round((number_pruned_table[2]/(number_pruned_table[1]+number_pruned_table[2])*100),2)
names(number_pruned_table) = c("assigned", "ambiguous", "% assigned", "% ambiguous")
number_pruned_table = (t(number_pruned_table))
knitr::kable(number_pruned_table, align="l", caption="Number of annotated cells") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) | assigned | ambiguous | % assigned | % ambiguous |
|---|---|---|---|
| 1043 | 35 | 96.75 | 3.25 |
UMAPs display cells colored by Seurat clusters and cell types annotated by SingleR. Annotation was performed for each cluster as unit.
# Set to annotation metadata column
sc$annotation = sc$SingleR.cluster.labels
# Set colors for cell types based on annotation
if (!is.null(sc@meta.data[["annotation"]])) {
if (length(unique(sc@meta.data[["annotation"]]))!=length(param$col_annotation)) {
message(paste0("No or wrong number of distinct colors for cell tpye annotation provieded. Generate colors using color palette", param$col_palette_annotation))
annotation_colours_out = suppressWarnings(SetAnnotationColours(sc, param$col_palette_annotation))
sc = annotation_colours_out[[1]]
param$col_annotation = annotation_colours_out[[2]]
}
}
# Print table with cell annotation
knitr::kable(annotated_clusters, align="l", caption="Cell types assigned to clusters", format="html") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", height="300px")| Var1 | Freq |
|---|---|
| B-cells | 1 |
| CD4+ T-cells | 1 |
| CD8+ T-cells | 2 |
| Monocytes | 2 |
| NK cells | 1 |
# Visualization of singleR annotation of clusters
p2 = Seurat::DimPlot(sc, reduction="umap", group.by="annotation", cols = param$col_annotation, pt.size=param$pt_size) +
AddStyle(title="Cell types \n(annotation per cluster)", xlab = "UMAP 1", ylab = "UMAP 2", legend_position="right", legend_title="Cell types") +
theme(title = element_text(size = 10)) +
NoGrid()
p2$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p2$data), ]
p2 = LabelClusters(p2, id="seurat_clusters", box=TRUE, segment.color="black", fill="white", repel = TRUE)
p = p_umap + p2
p# Visualization of singleR annotation of clusters separately per sample
p = Seurat::DimPlot(sc, reduction="umap", group.by="annotation", split.by = "orig.ident", cols = param$col_annotation, pt.size=param$pt_size) +
AddStyle(title="", xlab = "UMAP 1", ylab = "UMAP 2", legend_position="right", legend_title="Cell types") +
theme(title = element_text(size = 10)) +
NoGrid()
p$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p$data), ]
p = LabelClusters(p, id="seurat_clusters", box=TRUE, segment.color="black", fill="white", repel = TRUE)
p
# Set
cell_samples = levels(sc$orig.ident)
cell_celltype = unique(sc$annotation)
# Make count table
tbl = dplyr::count(sc[[c("orig.ident", "annotation")]], orig.ident, annotation) %>% tidyr::pivot_wider(names_from="annotation", values_from=n, values_fill=0) %>% as.data.frame()
rownames(tbl) = paste0(tbl[,"orig.ident"])
tbl[,"orig.ident"] = NULL
tbl_bar = tbl[cell_samples, , drop=FALSE] %>%
tibble::rownames_to_column(var="Sample") %>%
tidyr::pivot_longer(cols = tidyselect::all_of(cell_celltype), names_to="Celltype", values_to="Counts")
tbl_bar$Counts = as.numeric(tbl_bar$Counts)
tbl_bar$Celltype <- factor(tbl_bar$Celltype,levels = cell_celltype)
tbl_bar$Sample <- factor(tbl_bar$Sample, levels = cell_samples)
# Plot counts
p1 = ggplot(tbl_bar, aes(x=Celltype, y=Counts, fill=Sample)) +
geom_bar(stat="identity" ) +
AddStyle(title="",
fill=param$col_samples,
legend_title="Sample",
legend_position="bottom") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Make table with percentages
tbl_perc = round(t(tbl) / colSums(tbl) * 100, 2) %>% t() %>% as.data.frame()
tbl_bar_perc = tbl_perc[cell_samples, , drop=FALSE] %>%
tibble::rownames_to_column(var="Sample") %>%
tidyr::pivot_longer(cols = tidyselect::all_of(cell_celltype), names_to="Celltype", values_to="Percentage")
tbl_bar_perc$Percentage = as.numeric(tbl_bar_perc$Percentage)
tbl_bar_perc$Celltype <- factor(tbl_bar_perc$Celltype,levels = cell_celltype)
tbl_bar_perc$Sample <- factor(tbl_bar_perc$Sample, levels = cell_samples)
# Plot percentages
p2 = ggplot(tbl_bar_perc, aes(x=Celltype, y=Percentage, fill=Sample)) +
geom_bar(stat="identity" ) +
AddStyle(title="",
fill=param$col_samples,
legend_title="Sample",
legend_position="") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
p = p1 + p2
p# Plot counts
p1 = ggplot(tbl_bar, aes(x=Sample, y=Counts, fill=Celltype)) +
geom_bar(stat="identity" ) +
AddStyle(title="", xlab = "",
fill=param$col_annotation,
legend_title="Sample",
legend_position="bottom") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Calculate percentages
tbl_perc = round(t(tbl) / colSums(t(tbl)) * 100, 2) %>% t() %>% as.data.frame()
tbl_perc = round(tbl / rowSums(tbl) * 100, 2) %>% as.data.frame()
# Plot percentages
tbl_bar_perc = tbl_perc[cell_samples, , drop=FALSE] %>%
tibble::rownames_to_column(var="Sample") %>%
tidyr::pivot_longer(cols = tidyselect::all_of(cell_celltype), names_to="Cell_type", values_to="Percentage")
tbl_bar_perc$Percentage = as.numeric(tbl_bar_perc$Percentage)
tbl_bar_perc$Cell_type <- factor(tbl_bar_perc$Cell_type,levels = cell_celltype)
tbl_bar_perc$Sample <- factor(tbl_bar_perc$Sample, levels = cell_samples)
p2 = ggplot(tbl_bar_perc, aes(x=Sample, y=Percentage, fill=Cell_type)) +
geom_bar(stat="identity" ) +
AddStyle(title="", xlab = "",
fill=param$col_annotation,
legend_position="") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
p = p1 + p2
pp = ggplot(data = tbl_bar_perc, aes(x = Cell_type, y = Sample)) +
geom_point(aes(color = Percentage, size = ifelse(Percentage==0, NA, Percentage))) +
labs(col="Percentage", size="") +
scale_colour_gradient2(low="navy", mid="steelblue", high="darkgoldenrod1") +
#viridis::scale_color_viridis(option = "D") +
AddStyle(title="", ylab="Sample", legend_position="bottom") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) +
guides(size=guide_legend(order=1))
p# Add experiment details
Seurat::Misc(sc, "experiment") = list(project_id=param$project_id, date=Sys.Date(), species=gsub("_gene_ensembl", "", param$mart_dataset))
# Add parameter
Seurat::Misc(sc, "parameters") = param
# Add technical output (note: cannot use Misc function here)
sc@misc$technical = data.frame(ScrnaseqSessionInfo(param$path_to_git))### Export seurat object
saveRDS(sc, file=file.path(param$path_out, "data", "sc.rds"))### Convert data
################################################################################
### Export to Loupe Cell Browser
if (all(param$path_data$type == "10x")) {
# Export reductions (umap, pca, others)
for(r in Seurat::Reductions(sc)) {
write.table(Seurat::Embeddings(sc, reduction=r)[,1:2] %>% as.data.frame() %>% tibble::rownames_to_column(var="Barcode"),
file=file.path(param$path_out, "data", paste0("Loupe_projection_", r, ".csv")), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
}
# Export categorical metadata
loupe_meta = sc@meta.data
idx_keep = sapply(1:ncol(loupe_meta), function(x) !is.numeric(loupe_meta[,x]))
write.table(x=loupe_meta[, idx_keep] %>% tibble::rownames_to_column(var="barcode"),
file=file.path(param$path_out, "data", "Loupe_metadata.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
}
### Export as andata object (h5ad file)
# Convert Seurat v5 single cell object to anndata object
# Does not work for SCT at the moment
# However, it would contain counts and data layer
#scCustomize::as.anndata(x = sc, file_path = file.path(param$path_out, "data"), file_name = "sc_anndata.h5ad", assay="RNA",
# main_layer = "data", other_layers = "counts", transer_dimreduc = TRUE, verbose = FALSE)
# scesy only worked for v3 and v4 objects
# Convert to V3/4/Assay structure
sc_v3 = scCustomize::Convert_Assay(seurat_object = sc, convert_to = "V3", assay = "RNA")
# Convert Seurat v3 single cell object to anndata object
adata = sceasy::convertFormat(sc_v3, from="seurat", to="anndata", outFile=NULL, assay=DefaultAssay(sc))
# Write to h5ad file
adata$write(file.path(param$path_out, "data", "sc_anndata.h5ad"), compression="gzip")
### Export count matrix
invisible(ExportSeuratAssayData(sc,
dir=file.path(param$path_out, "data", "counts"),
assays=param$assay_raw,
slot="counts",
include_cell_metadata_cols=colnames(sc[[]]),
metadata_prefix=paste0(param$project_id, ".")))
### Export metadata
openxlsx::write.xlsx(x=sc[[]] %>% tibble::rownames_to_column(var="Barcode"), file=file.path(param$path_out, "data", "cell_metadata.xlsx"), rowNames=FALSE, colNames=TRUE)We export the assay data, cell metadata, clustering and visualization.
Result files:
* sc.rds: Seurat object for import into R
* cell_metadata.xlsx: Cell metadata in excel format
* counts folder: Contains raw count matrix of the entire dataset (sparse
matrix)
We export the assay data, cell metadata, clustering and visualization in andata format.
Result file:
* sc.h5ad: H5AD object
If all provided datasets are of type “10x”, we export the UMAP 2D visualization, metadata such as the cell clusters, and lists of differentially expressed genes, so you can open and work with these in the Loupe Cell Browser (https://support.10xgenomics.com/single-cell-gene-expression/software/visualization/latest/what-is-loupe-cell-browser).
Result files are:
* Loupe_projection_(umap|pca|…).csv: Seurat UMAP/PCA/… projections for
visualization
* Loupe_metadata.csv: Seurat cell meta data including clusters and cell
cycle phases
Projections can be imported in Loupe via “Import Projection”, cell meta data via “Import Categories” and gene sets via “Import Lists”.
The following parameters were used to run the workflow.
out = ScrnaseqParamsInfo(params=param)
knitr::kable(out, align="l") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"), full_width=FALSE, position="left")| Name | Value |
|---|---|
| path_to_git | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis |
| path_to_basic_settings | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/basic_settings.R |
| path_to_advanced_settings | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/advanced_settings.R |
| scriptname | scripts/pre-processing/cluster_analysis.Rmd |
| author | kosankem |
| assay_raw | RNA |
| downsample_cells_equally | FALSE |
| cell_filter | nFeature_RNA:20, NA; nCount_RNA:200, 20000; percent_mt:0, 18; Sample1:nFeature_RNA=c(20, NA), nCount_RNA=c(200, 20000), percent_mt=c(0, 18); Sample2:nFeature_RNA=c(20, NA), nCount_RNA=c(200, 20000), percent_mt=c(0, 18) |
| feature_filter | min_counts:1; min_cells:5; Sample1:min_counts=1, min_cells=5; Sample2:min_counts=1, min_cells=5 |
| samples_min_cells | 10 |
| norm | RNA |
| cc_remove | FALSE |
| cc_remove_all | FALSE |
| cc_rescore_after_merge | TRUE |
| integrate_samples | method:merge; dimensions:30; k_anchor:10; k_weight:100; integration_function:CCAIntegration |
| experimental_groups | homogene |
| pc_n | 8 |
| cluster_k | 20 |
| umap_k | 30 |
| cluster_resolution_test | 0.5, 0.7, 0.8 |
| cluster_resolution | 0.6 |
| marker_padj | 0.05 |
| marker_log2FC | 1 |
| marker_pct | 0.25 |
| enrichr_site | Enrichr |
| enrichr_padj | 0.05 |
| col_palette_samples | ggsci::pal_igv |
| col_palette_clusters | ggsci::pal_igv |
| col_palette_annotation | ggsci::pal_ucscgb |
| col | #0086b3 |
| col_bg | #D3D3D3 |
| pt_size | 0.5 |
| project_id | Testdata |
| data | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/output/Testdata/pre-processing/data/sc.rds |
| path_data | name:Sample1, Sample2; type:10x, 10x; path:/mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/data/counts/sample1/, /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/data/counts/sample2/ |
| species | human |
| path_out | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/output/Testdata/cluster_analysis |
| debugging_mode | default_debugging |
| mart_dataset | hsapiens_gene_ensembl |
| mt | ^MT- |
| enrichr_dbs | GO_Biological_Process_2023, WikiPathway_2023_Human, Azimuth_2023, CellMarker_2024 |
| annotation_dbs | BlueprintEncodeData() |
| annot_version | 98 |
| annot_main | ensembl=ensembl_gene_id, symbol=external_gene_name, entrez=entrezgene_accession |
| mart_attributes | ensembl_gene_id, external_gene_name, entrezgene_accession, chromosome_name, start_position, end_position, percentage_gene_gc_content, gene_biotype, strand, description |
| path_reference | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98 |
| reference | hsapiens_gene_ensembl.v98.annot.txt |
| file_annot | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98/hsapiens_gene_ensembl.v98.annot.txt |
| file_cc_genes | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98/cell_cycle_markers.xlsx |
| col_samples | Sample1=#5050FFFF, Sample2=#CE3D32FF |
| col_clusters | 1=#5050FFFF, 2=#CE3D32FF, 3=#749B58FF, 4=#F0E685FF, 5=#466983FF, 6=#BA6338FF, 7=#5DB1DDFF |
| col_annotation | Monocytes=#FF0000FF, CD8+ T-cells=#FF9900FF, CD4+ T-cells=#FFCC00FF, B-cells=#00FF00FF, NK cells=#6699FFFF |
This report was generated using the scripts/pre-processing/cluster_analysis.Rmd script. Software versions were collected at run time.
out = ScrnaseqSessionInfo(param$path_to_git)
knitr::kable(out, align="l") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))| Name | Value |
|---|---|
| Run on: | Tue Sep 03 04:34:13 PM 2024 |
| ktrns/scrnaseq | 38ca32c4ba344d783cf79821eddac5495563f985 |
| Container | NA |
| R | R version 4.2.1 (2022-06-23) |
| Platform | x86_64-pc-linux-gnu (64-bit) |
| Operating system | Ubuntu 20.04.6 LTS |
| Host name | hpc-rc11 |
| Host OS | #138-Ubuntu SMP Wed Jun 22 15:00:31 UTC 2022 (5.4.0-122-generic) |
| Packages | abind1.4-5, AnnotationDbi1.60.2, AnnotationHub3.6.0, ape5.8, assertthat0.2.1, backports1.4.1, beachmat2.14.2, beeswarm0.4.0, bibtex0.5.1, Biobase2.58.0, BiocFileCache2.6.1, BiocGenerics0.44.0, BiocManager1.30.22, BiocParallel1.32.6, BiocSingular1.14.0, BiocVersion3.16.0, Biostrings2.66.0, bit4.0.5, bit644.0.5, bitops1.0-7, blob1.2.4, bslib0.6.1, cachem1.0.8, celldex1.8.0, checkmate2.3.1, circlize0.4.16, cli3.6.2, cluster2.1.3, clustree0.5.1, codetools0.2-18, colorspace2.1-0, cowplot1.1.3, crayon1.5.2, curl5.2.1, data.table1.15.2, DBI1.2.2, dbplyr2.2.1, DelayedArray0.24.0, DelayedMatrixStats1.20.0, deldir2.0-4, digest0.6.34, dotCall641.1-1, dplyr1.1.4, ellipsis0.3.2, enrichR3.2, evaluate0.23, ExperimentHub2.6.0, fansi1.0.6, farver2.1.1, fastDummies1.7.3, fastmap1.1.1, filelock1.0.3, fitdistrplus1.1-11, forcats1.0.0, future1.33.1, future.apply1.11.1, generics0.1.3, GenomeInfoDb1.34.9, GenomeInfoDbData1.2.9, GenomicRanges1.50.2, ggbeeswarm0.7.2, ggforce0.4.2, ggplot23.5.0, ggprism1.0.5, ggraph2.2.1, ggrastr1.0.2, ggrepel0.9.5, ggridges0.5.6, ggsci3.0.1, glmGamPoi1.10.2, GlobalOptions0.1.2, globals0.16.2, glue1.7.0, goftest1.2-3, graphlayouts1.1.1, gridExtra2.3, gridGraphics0.5-1, gtable0.3.4, highr0.10, hms1.1.3, htmltools0.5.7, htmlwidgets1.6.4, httpuv1.6.14, httr1.4.7, ica1.0-3, igraph2.0.2, interactiveDisplayBase1.36.0, IRanges2.32.0, irlba2.3.5.1, janitor2.2.0, jquerylib0.1.4, jsonlite1.8.8, kableExtra1.4.0, KEGGREST1.38.0, KernSmooth2.23-20, knitcitations1.0.12, knitr1.45, labeling0.4.3, later1.3.2, lattice0.20-45, lazyeval0.2.2, leiden0.4.3.1, lifecycle1.0.4, listenv0.9.1, lmtest0.9-40, lubridate1.9.3, magrittr2.0.3, MASS7.3-58, MAST1.27.1, Matrix1.6-5, MatrixGenerics1.10.0, matrixStats1.1.0, memoise2.0.1, mime0.12, miniUI0.1.1.1, munsell0.5.0, nlme3.1-157, openxlsx4.2.5.2, paletteer1.6.0, parallelly1.37.1, patchwork1.2.0, pbapply1.7-2, pheatmap1.0.12, pillar1.9.0, pkgconfig2.0.3, plotly4.10.4, plyr1.8.9, png0.1-8, polyclip1.10-6, prettyunits1.2.0, progress1.2.3, progressr0.14.0, promises1.2.1, purrr1.0.2, R.methodsS31.8.2, R.oo1.26.0, R.utils2.12.3, R62.5.1, ragg1.2.7, RANN2.6.1, rappdirs0.3.3, RColorBrewer1.1-3, Rcpp1.0.12, RcppAnnoy0.0.22, RcppHNSW0.6.0, RCurl1.98-1.14, RefManageR1.4.0, rematch22.1.2, renv0.16.0, reshape21.4.4, reticulate1.35.0, rjson0.2.21, rlang1.1.3, rmarkdown2.25, ROCR1.0-11, RSpectra0.16-1, RSQLite2.3.5, rstudioapi0.15.0, rsvd1.0.5, Rtsne0.17, S4Vectors0.36.2, sass0.4.8, ScaledMatrix1.6.0, scales1.3.0, scattermore1.2, scCustomize2.1.2, sceasy0.0.7, sctransform0.4.1, sessioninfo1.2.2, Seurat5.0.2, SeuratObject5.0.1, shape1.4.6.1, shiny1.8.0, SingleCellExperiment1.20.1, SingleR2.0.0, snakecase0.11.1, sp2.1-3, spam2.10-0, sparseMatrixStats1.10.0, spatstat.data3.0-4, spatstat.explore3.2-6, spatstat.geom3.2-9, spatstat.random3.2-3, spatstat.sparse3.0-3, spatstat.utils3.0-4, stringi1.8.3, stringr1.5.1, SummarizedExperiment1.28.0, survival3.2-13, svglite2.1.3, systemfonts1.0.5, tensor1.5, textshaping0.3.7, tibble3.2.1, tidygraph1.3.1, tidyr1.3.1, tidyselect1.2.0, timechange0.3.0, tweenr2.0.3, utf81.2.4, uwot0.1.16, vctrs0.6.5, vipor0.4.7, viridis0.6.5, viridisLite0.4.2, withr3.0.0, WriteXLS6.7.0, xfun0.41, xml21.3.6, xtable1.8-4, XVector0.38.0, yaml2.3.8, zip2.3.1, zlibbioc1.44.0, zoo1.8-12 |
This Workflow was developed as module of the sc_analysis workflow at the Research Core Unit Genomics, Hannover Medical School (Hannover, Germany). Seurat Vignettes were initially used as templates (Hao et al. (2023), Hao et al. (2021)).
# Writes knitcitations references to references.bib file.
knitcitations::write.bibtex(file=file.path(param$path_out, "references.bib"))